From 53f152b8d0da0a15007188ffc74d129e15ec5300 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 5 Aug 2025 08:07:49 -0700 Subject: [PATCH 01/10] [build] fix WebAssembly build on macOS/arm64 (#25653) ### Description Fix the WebAssembly build on macOS/arm64 by not appending "-Donnxruntime_USE_KLEIDIAI=ON" to cmake_args. KleidiAI should not be enabled for WebAssembly builds. --- tools/ci_build/build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 56fd3f1323e92..2080f4f7941c6 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -889,7 +889,7 @@ def generate_build_tree( # * Leave disabled if "no_kleidiai" argument was specified. # * Enable if the target is Android and args.android_abi contains arm64* # * Enable for a Windows cross compile build if compile target is an Arm one. - # * Finally enable if platform.machine contains "arm64". This should cover the following cases: + # * Finally enable if platform.machine contains "arm64" and not a WebAssembly build. This should cover the following cases: # * Linux on Arm # * MacOs (case must be ignored) # * TODO Delegate responsibility for Onnxruntime_USE_KLEIDIAI = ON to CMake logic @@ -897,7 +897,7 @@ def generate_build_tree( if ( (args.android and "arm64" in args.android_abi.lower()) or (is_windows() and (args.arm64 or args.arm64ec or args.arm) and platform.architecture()[0] != "AMD64") - or ("arm64" in platform.machine().lower()) + or ("arm64" in platform.machine().lower() and not args.build_wasm) ): cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"] From b1546da6f524f4995ed3273648ebf1152f9f727a Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 5 Aug 2025 13:53:13 -0700 Subject: [PATCH 02/10] [build] fix build with delay load hook (#25657) ### Description Fix the build when at least one delay-load DLL is needed for onnxruntime.dll. The old code contained a non-standard macro definition, which is treated as a build error by the latest VC++. --- onnxruntime/core/dll/delay_load_hook.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/dll/delay_load_hook.cc b/onnxruntime/core/dll/delay_load_hook.cc index bc5e1aa662721..3e7b9c132456d 100644 --- a/onnxruntime/core/dll/delay_load_hook.cc +++ b/onnxruntime/core/dll/delay_load_hook.cc @@ -45,7 +45,7 @@ namespace { -#define DEFINE_KNOWN_DLL(name) {#name ".dll", L#name L".dll"} +#define DEFINE_KNOWN_DLL(name) {#name ".dll", L## #name L".dll"} constexpr struct { const char* str; From d912167c91a7c9f471eb3fc88300e03727ac2ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Wed, 6 Aug 2025 17:58:53 +0200 Subject: [PATCH 03/10] [NV TRT RTX EP] Cumulative TRT RTX EP merge (#25656) This currently holds 2 major improvements: - dynamic shape models should have much lower memory usage and, in addition, memory management is moved towards ORT allocators - the per-inference overhead for shape binding and address updates is reduced --------- Co-authored-by: Gaurav Garg --- .../nv_tensorrt_rtx/nv_execution_provider.cc | 680 +++++++----------- .../nv_tensorrt_rtx/nv_execution_provider.h | 69 +- .../nv_execution_provider_info.h | 1 - .../nv_execution_provider_utils.h | 25 + .../nv_tensorrt_rtx/nv_basic_test.cc | 4 +- 5 files changed, 332 insertions(+), 447 deletions(-) diff --git
a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc index 286db9070766d..cc9d9f3da1d81 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc @@ -123,10 +123,11 @@ void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* / // even for empty tensors, so allocate a dummy byte. size = std::max(size, static_cast(1)); if (size > allocated_size) { - cudaFree(outputPtr); + alloc_->Free(alloc_, outputPtr); outputPtr = nullptr; allocated_size = 0; - if (cudaMalloc(&outputPtr, size) == cudaSuccess) { + outputPtr = alloc_->Alloc(alloc_, size); + if (outputPtr) { allocated_size = size; } } @@ -352,193 +353,6 @@ bool ApplyProfileShapesFromProviderOptions(std::vector shape values" for the INT32 shape tensor input across this inference run - * @param shape_tensor_values_int64 holds "shape tensor -> shape values" for the INT64 shape tensor input across this inference run - */ -Status ApplyProfileShapesFromInputTensorValue(std::vector& trt_profiles, - Ort::KernelContext ctx, - nvinfer1::ITensor* input, - ShapeRangesMap& shape_ranges, - const std::unordered_map& input_indexes, - std::unordered_map>& shape_tensor_values, - std::unordered_map>& shape_tensor_values_int64, - cudaStream_t stream, - bool* engine_update) { - for (size_t i = 0; i < trt_profiles.size(); i++) { - const std::string& input_name = input->getName(); - nvinfer1::Dims dims = input->getDimensions(); - int nb_dims = dims.nbDims; - - size_t input_index = 0; - const auto& iter = input_indexes.find(input_name); - if (iter != input_indexes.end()) { - input_index = iter->second; - } - - auto input_tensor = ctx.GetInput(input_index); - auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); - const auto tensor_shapes = tensor_info.GetShape(); - auto& shape_ranges_per_input = shape_ranges[input_name]; - - auto trt_profile = trt_profiles[i]; - - // If there are multiple profiles, for second and rest of profiles, simply copy the min/max/opt profile values from the first profile. - // Following "if statement" won't be executed since TRT EP currently only allows single profile for non-explicit profiles case. - if (i > 0) { - if (input->isShapeTensor()) { - // shape tensor - int shape_size = nb_dims == 0 ? 
1 : static_cast(tensor_shapes[0]); - std::vector shapes_min(shape_size), shapes_opt(shape_size), shapes_max(shape_size); - for (int j = 0; j < shape_size; j++) { - shapes_min[j] = *(trt_profiles[0]->getShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN)); - shapes_max[j] = *(trt_profiles[0]->getShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX)); - shapes_opt[j] = *(trt_profiles[0]->getShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT)); - } - trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, &shapes_min[0], shape_size); - trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, &shapes_max[0], shape_size); - trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, &shapes_opt[0], shape_size); - } else { - // execution tensor - nvinfer1::Dims dims_min, dims_opt, dims_max; - dims_min = trt_profiles[0]->getDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN); - dims_max = trt_profiles[0]->getDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX); - dims_opt = trt_profiles[0]->getDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT); - trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, dims_min); - trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, dims_max); - trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, dims_opt); - } - continue; - } - - // Create shape profile - if (input->isShapeTensor()) { - // Get shape values for shape tensor input - const auto tensor_type = tensor_info.GetElementType(); - // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension - int shape_size = dims.nbDims == 0 ? 1 : static_cast(tensor_shapes[0]); - // For setting TRT optimization profile. 
(Note: the min/opt/max profile values are still int32 even though int64 is supported after TRT 10) - std::vector values(shape_size); - - switch (tensor_type) { - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { - auto buffer = std::make_unique(shape_size); - auto status = GetShapeOfShapeTensor(input_tensor, buffer.get(), shape_size, stream); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); - } - shape_tensor_values[input_name].resize(shape_size); - for (int j = 0; j < shape_size; ++j) { - shape_tensor_values[input_name][j] = buffer[j]; - values[j] = buffer[j]; - } - break; - } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { - auto buffer = std::make_unique(shape_size); - auto status = GetShapeOfShapeTensor(input_tensor, buffer.get(), shape_size, stream); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); - } - shape_tensor_values_int64[input_name].resize(shape_size); - for (int j = 0; j < shape_size; ++j) { - shape_tensor_values_int64[input_name][j] = buffer[j]; - values[j] = static_cast(buffer[j]); - } - break; - } - default: { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT shape tensor data type: " + std::to_string(tensor_type) + " not supported."); - } - } - - // Update shape ranges - std::vector shapes_min(shape_size), shapes_opt(shape_size), shapes_max(shape_size); - int shape_range_size = static_cast(shape_ranges_per_input.size()); - if (shape_size == shape_range_size) { - // If shape size matches, check/update shape range - for (int j = 0; j < shape_size; ++j) { - auto& shape_range = shape_ranges_per_input[j][0]; // only has one profile - shapes_min[j] = static_cast(shape_range[0]); - shapes_max[j] = static_cast(shape_range[1]); - shapes_opt[j] = static_cast(shape_range[2]); - - const auto& tensor_shape_value = values[j]; - // Update shape range lower bound - if (tensor_shape_value < shape_range[0]) { - shape_range[0] = tensor_shape_value; - shapes_min[j] = tensor_shape_value; - *engine_update = true; - } - // Update shape range upper bound - if (tensor_shape_value > shape_range[1]) { - shape_range[1] = tensor_shape_value; - shape_range[2] = tensor_shape_value; - shapes_max[j] = tensor_shape_value; - shapes_opt[j] = tensor_shape_value; - *engine_update = true; - } - } - } else { - // If shape size doesn't match, initialize shape_range with the new shape value - shape_ranges_per_input.clear(); - for (int j = 0; j < shape_size; ++j) { - const auto& tensor_shape_value = values[j]; - std::vector> profile_vector; - std::vector shape_vector{tensor_shape_value, tensor_shape_value, tensor_shape_value}; - profile_vector.push_back(shape_vector); // only one profile needed - shape_ranges_per_input[j] = profile_vector; - shapes_min[j] = tensor_shape_value; - shapes_opt[j] = tensor_shape_value; - shapes_max[j] = tensor_shape_value; - } - *engine_update = true; - } - - trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, &shapes_min[0], shape_size); - trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, &shapes_max[0], shape_size); - trt_profile->setShapeValuesV2(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, &shapes_opt[0], shape_size); - } else { // Execution tensor - nvinfer1::Dims dims_min(dims), dims_opt(dims), dims_max(dims); - for (int j = 0, end = nb_dims; j < end; ++j) { - const auto& tensor_shape = tensor_shapes[j]; - if (shape_ranges_per_input.find(j) != shape_ranges_per_input.end()) { - auto& 
shape_range = shape_ranges_per_input[j][0]; // only has one profile - dims_min.d[j] = static_cast(shape_range[0]); - dims_max.d[j] = static_cast(shape_range[1]); - dims_opt.d[j] = static_cast(shape_range[2]); - - // Update minimum dimension - if (tensor_shape < shape_range[0]) { - shape_range[0] = tensor_shape; - dims_min.d[j] = static_cast(tensor_shape); - *engine_update = true; - } - // Update maximum dimension - if (tensor_shape > shape_range[1]) { - shape_range[1] = tensor_shape; - shape_range[2] = tensor_shape; - dims_max.d[j] = static_cast(tensor_shape); - dims_opt.d[j] = static_cast(tensor_shape); - *engine_update = true; - } - } - } - - trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMIN, dims_min); - trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kMAX, dims_max); - trt_profile->setDimensions(input_name.c_str(), nvinfer1::OptProfileSelector::kOPT, dims_opt); - } - } - return Status::OK(); -} - #define CASE_GET_INPUT_TENSOR(DATA_TYPE, SrcT) \ case DATA_TYPE: { \ auto input_tensor_ptr = input_tensor.GetTensorData(); \ @@ -554,6 +368,7 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector(); \ + skip_input_binding_allowed = false; \ if (input_tensor_ptr != nullptr && elem_cnt > 0) { \ scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ data = scratch_buffers.back().get(); \ @@ -568,6 +383,7 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector(); \ + data_ptr = output_tensor_ptr; \ if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ buffers[output_name] = output_tensor_ptr; \ } else { \ @@ -580,6 +396,8 @@ Status ApplyProfileShapesFromInputTensorValue(std::vector(); \ + data_ptr = output_tensor_ptr; \ + skip_output_binding_allowed = false; \ if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ buffers[output_name] = scratch_buffers.back().get(); \ @@ -628,7 +446,8 @@ Status BindContextInput(Ort::KernelContext& ctx, std::unordered_map>& shape_tensor_values_int64, std::vector>& scratch_buffers, OrtAllocator* alloc, - cudaStream_t stream) { + cudaStream_t stream, + bool& skip_input_binding_allowed) { auto input_tensor = ctx.GetInput(input_index); auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); const auto tensor_shapes = tensor_info.GetShape(); @@ -647,7 +466,7 @@ Status BindContextInput(Ort::KernelContext& ctx, if (trt_engine->isShapeInferenceIO(input_name)) { // Bind "shape tensor" input buffer - + skip_input_binding_allowed = false; // Shape tensor input binding cannot be skipped // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension int shape_size = trt_engine->getTensorShape(input_name).nbDims == 0 ? 
1 : static_cast(tensor_shapes[0]); switch (tensor_type) { @@ -775,19 +594,20 @@ Status BindContextOutput(Ort::KernelContext& ctx, DDSOutputAllocatorMap& dds_output_allocator_map, std::vector>& scratch_buffers, OrtAllocator* alloc, - std::unordered_map& buffers) { + std::unordered_map& buffers, + nvinfer1::Dims& dims, + void*& data_ptr, + bool& skip_output_binding_allowed) { // Get output shape - nvinfer1::Dims dims = trt_context->getTensorShape(output_name); + dims = trt_context->getTensorShape(output_name); int nb_dims = dims.nbDims; bool is_DDS = false; - std::vector output_shapes(nb_dims); for (int j = 0, end = nb_dims; j < end; ++j) { // data-dependent shape if (dims.d[j] == -1) { is_DDS = true; break; } - output_shapes[j] = dims.d[j]; } auto known_DDS = dds_output_allocator_map.find(output_name) != dds_output_allocator_map.end(); @@ -800,16 +620,19 @@ Status BindContextOutput(Ort::KernelContext& ctx, // Otherwise, if the shape of the output tensor is known prior to the runtime, ORT will pre-allocate memory buffer for the output tensor for enqueueV3. if (is_DDS || known_DDS) { if (!known_DDS) { - auto allocatorPtr = std::make_unique(); + auto allocatorPtr = std::make_unique(alloc); trt_context->setOutputAllocator(output_name, allocatorPtr.get()); dds_output_allocator_map[output_name] = std::move(allocatorPtr); + dims.nbDims = -1; // Set to -1 to indicate that the shape is not known at this point. + data_ptr = nullptr; // Set data_ptr to nullptr for DDS output binding. } } else { - output_tensors[i] = ctx.GetOutput(output_index, output_shapes); + output_tensors[i] = ctx.GetOutput(output_index, dims.d, nb_dims); auto& output_tensor = output_tensors[i]; const auto elem_cnt = output_tensor.GetTensorTypeAndShapeInfo().GetElementCount(); switch (output_type) { + // below macros set data_ptr and skip_output_binding_allowed variables CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float) CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, uint16_t) CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16, uint16_t) @@ -840,7 +663,6 @@ Status BindContextOutput(Ort::KernelContext& ctx, * we are waiting for ORT core to support "assign" memory address to ORT context output. Some works need to be done in ORT memory planner to be aware of this memory support. */ Status BindKernelOutput(Ort::KernelContext& ctx, - OrtMemoryInfo* /*mem_info*/, DDSOutputAllocatorMap& allocator_map, char const* output_name, size_t output_index, @@ -903,31 +725,6 @@ NvExecutionProvider::PerThreadContext::~PerThreadContext() { trt_context_map_.clear(); } -/* - * Returns true if the shape ranges maintained by the PerThreadContext is different from the shape ragnes maintained by TRT EP, meaning the - * engine is being updated and the execution context maintained by the PerThreadContext should be updated as well. Otherwise, returns false. - * - */ -bool NvExecutionProvider::PerThreadContext::CompareProfileShapes(std::string fused_node, ShapeRangesMap& shape_ranges) { - if (shape_ranges.size() > 0) { - if (input_shape_ranges_[fused_node] != shape_ranges) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] The shape ranges maintained by the PerThreadContext is different from the shape ranges maintained by TRT EP. \ - This means the engine is updated and will need to update the execution context as well."; - return true; - } - } - return false; -} - -/* - * Updates the shape ranges maintained by the PerThreadContext. 
- * As long as the execution context maintained by the PerThreadContext is updated, the associated shape ranges should be updated as well. - * - */ -void NvExecutionProvider::PerThreadContext::UpdateProfileShapes(std::string fused_node, ShapeRangesMap& shape_ranges) { - input_shape_ranges_[fused_node] = shape_ranges; -} - void NvExecutionProvider::PerThreadContext::ResetTensorRTContext(std::string fused_node) { auto it = trt_context_map_.find(fused_node); if (it != trt_context_map_.end()) { @@ -1081,7 +878,6 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info) engine_decryption_lib_path_ = info.engine_decryption_lib_path; } force_sequential_engine_build_ = info.force_sequential_engine_build; - context_memory_sharing_enable_ = info.context_memory_sharing_enable; sparsity_enable_ = info.sparsity_enable; auxiliary_streams_ = info.auxiliary_streams; profile_min_shapes = info.profile_min_shapes; @@ -1225,7 +1021,6 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info) << ", nv_engine_decryption_enable: " << engine_decryption_enable_ << ", nv_engine_decryption_lib_path: " << engine_decryption_lib_path_ << ", nv_force_sequential_engine_build: " << force_sequential_engine_build_ - << ", nv_context_memory_sharing_enable: " << context_memory_sharing_enable_ << ", nv_sparsity_enable: " << sparsity_enable_ << ", nv_auxiliary_streams: " << auxiliary_streams_ << ", nv_cuda_graph_enable: " << cuda_graph_enable_ @@ -1298,9 +1093,15 @@ void NvExecutionProvider::IncrementRegularRunCountBeforeGraphCapture() { } std::vector NvExecutionProvider::CreatePreferredAllocators() { + OrtArenaCfg arena_cfg(0, static_cast(ArenaExtendStrategy::kSameAsRequested), + -1, -1, -1, -1); AllocatorCreationInfo default_memory_info( [](OrtDevice::DeviceId device_id) { return std::make_unique(device_id, CUDA); }, - narrow(device_id_)); + narrow(device_id_), + true, + arena_cfg, + // make it stream aware + true); AllocatorCreationInfo pinned_allocator_info( [](OrtDevice::DeviceId device_id) { @@ -2244,6 +2045,96 @@ common::Status NvExecutionProvider::Compile(const std::vector return Status::OK(); } +/** + * @brief Determines whether I/O binding is required for TensorRT execution. + * + * This function optimizes TensorRT inference performance by determining when tensor + * input/output binding operations can be skipped. Binding is an expensive operation + * that involves setting up tensor pointers in the TensorRT execution context, so + * avoiding unnecessary rebinding can significantly improve inference throughput. + * + * The function implements a three-tier decision logic: + * 1. First run: Always requires binding to establish initial tensor mappings + * 2. Subsequent runs with optimization allowed: Only rebind if tensors have changed + * 3. 
Subsequent runs without optimization: Always rebind for safety + * + * @tparam TRTState The TensorRT state type (TensorrtFuncState or TensorrtShortFuncState) + * @param trt_state Pointer to the TensorRT execution state containing tensor cache + * and configuration flags + * @param ctx ONNX Runtime kernel context providing access to current input tensors + * + * @return true if I/O binding is required (tensors changed or safety conditions apply), + * false if binding can be safely skipped (optimization enabled and tensors unchanged) + * + * @note This function modifies trt_state by: + * - Setting is_first_run to false after first execution + * - Caching current tensor parameters in input_tensors vector + * - Updating cached tensors when changes are detected + * + * @warning The skip_io_binding_allowed flag must be carefully managed as incorrect + * usage can lead to inference with stale tensor bindings and incorrect results. + */ +template +static bool IsIOBindingRequired(TRTState* const trt_state, const Ort::KernelContext& ctx) { + // Check if input tensors have changed since the last run + // If so, we need to bind input tensors again + bool require_io_binding = false; + + if (trt_state->is_first_run) { + // If this is the first run, we always bind input tensors + require_io_binding = true; + auto input_tensor_count = ctx.GetInputCount(); + auto output_tensor_count = ctx.GetOutputCount(); + trt_state->input_tensors.resize(input_tensor_count); + trt_state->output_tensors.resize(output_tensor_count); + for (size_t input_index = 0; input_index < input_tensor_count; ++input_index) { + const auto& input_tensor = ctx.GetInput(input_index); + const auto& tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); + + trt_state->input_tensors[input_index] = TensorParams{input_tensor.GetTensorRawData(), tensor_info.GetShape()}; + } + trt_state->is_first_run = false; + } else if (trt_state->skip_io_binding_allowed) { + // If skip_io_binding_allowed is true, we can skip binding if input tensors are the same as before + auto input_tensor_count = ctx.GetInputCount(); + for (size_t input_index = 0; input_index < input_tensor_count; ++input_index) { + const auto& input_tensor = ctx.GetInput(input_index); + const auto& tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); + + TensorParams ip_tensor{input_tensor.GetTensorRawData(), tensor_info.GetShape()}; + + if (ip_tensor != trt_state->input_tensors[input_index]) { + require_io_binding = true; + trt_state->input_tensors[input_index] = ip_tensor; + } + } + } else { + // If this is not the first run and skip_io_binding_allowed is false, we need to bind input tensors + require_io_binding = true; + } + + if (!require_io_binding) { + // no need to bind inputs, but check outputs as well + auto output_tensor_count = ctx.GetOutputCount(); + + for (size_t output_index = 0; output_index < output_tensor_count; ++output_index) { + const auto& prev_output_tensor = trt_state->output_tensors[output_index]; + + if (prev_output_tensor.dims.nbDims != -1) { + const auto& new_output_tensor = ctx.GetOutput(output_index, prev_output_tensor.dims.d, prev_output_tensor.dims.nbDims); + + // different output tensor data means we need to bind outputs again + if (prev_output_tensor.data != new_output_tensor.GetTensorRawData()) { + require_io_binding = true; + break; + } + } + } + } + + return require_io_binding; +} + Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& graph_body_viewer, const Node& fused_node, std::unordered_map& input_map, @@ -2349,21 
+2240,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr ShapeRangesMap input_explicit_shape_ranges; ShapeRangesMap input_implicit_shape_ranges; - auto tensor_is_dynamic = [&](nvinfer1::ITensor* tensor) -> bool { - if (tensor->isShapeTensor()) { - return true; - } else { - nvinfer1::Dims dims = tensor->getDimensions(); - // Execution tensor - for (int j = 0, end = dims.nbDims; j < end; ++j) { - if (dims.d[j] == -1) { - return true; - } - } - } - return false; - }; - bool has_dynamic_shape = false; // True if input tensor has dynamic shape and no explicit profile is specified, otherwise false if ((!profile_min_shapes_.empty()) && (!profile_max_shapes_.empty()) && (!profile_opt_shapes_.empty())) { has_explicit_profile = true; @@ -2375,7 +2251,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr } else { for (unsigned int i = 0, end = num_inputs; i < end; ++i) { auto input = trt_network->getInput(i); - has_dynamic_shape |= tensor_is_dynamic(input); + has_dynamic_shape |= checkTrtTensorIsDynamic(input); } if (has_dynamic_shape) { LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] No explicit optimization profile was specified. " @@ -2574,31 +2450,18 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr // Build context // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - if (context_memory_sharing_enable_) { -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) -#endif - size_t mem_size = trt_engine->getDeviceMemorySizeV2(); -#if defined(_MSC_VER) -#pragma warning(pop) -#endif - if (mem_size > max_ctx_mem_size_) { - max_ctx_mem_size_ = mem_size; - } - trt_context = std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); - } else { - trt_context = std::unique_ptr(trt_engine->createExecutionContext()); - } + trt_context = std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); if (!trt_context) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "Nv EP could not build execution context for fused node: " + fused_node.Name()); } + bool is_dynamic_shape_context = false; // Create input to index map for (int i = 0; i < num_inputs; ++i) { auto input = trt_network->getInput(i); const std::string& input_name = input->getName(); + is_dynamic_shape_context |= checkTrtDimIsDynamic(trt_engine->getTensorShape(input_name.c_str())); const auto& iter = input_map.find(input_name); if (iter != input_map.end()) { input_indexes[input_name] = iter->second; @@ -2639,10 +2502,9 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr input_shape_ranges_[context->node_name], &tensorrt_mu_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name], - context_memory_sharing_enable_, &max_ctx_mem_size_, engine_decryption_enable_, engine_decryption_, engine_encryption_, detailed_build_log_, sparsity_enable_, - auxiliary_streams_, cuda_graph_enable_, cache_prefix_, cache_suffix}; + auxiliary_streams_, cuda_graph_enable_, is_dynamic_shape_context, cache_prefix_, cache_suffix}; *state = p.release(); return 0; }; @@ -2666,25 +2528,20 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr const std::unordered_map& output_indexes = (trt_state->output_info)[0]; const 
std::unordered_map& output_types = (trt_state->output_info)[1]; auto fused_node_name = trt_state->fused_node_name; - // This map "shape_ranges" contains the shape range info for setting TRT optimization profiles. - // The info is used for both shape tensor and execution tensor: - // tensor name->(dimension->[min, max, opt]) - auto& shape_ranges = trt_state->input_shape_ranges; + std::unordered_map> shape_tensor_values; // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run std::unordered_map> shape_tensor_values_int64; // same as above but for int64 shape tensor input auto& dds_output_allocator_map = this->dds_output_allocator_maps_[fused_node_name]; auto trt_engine = trt_state->engine->get(); auto trt_context = trt_state->context->get(); auto trt_profiles = trt_state->profiles; - auto max_context_mem_size_ptr = trt_state->max_context_mem_size_ptr; - int num_inputs = static_cast(input_indexes.size()); int num_outputs = static_cast(output_indexes.size()); std::unordered_set input_names; - OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NVIDIA, - narrow(device_id_)); - OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device); if (alloc_ == nullptr) { + OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NVIDIA, + narrow(device_id_)); + OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device); Ort::ThrowOnError(api->KernelContext_GetAllocator(context, &mem_info, &alloc_)); } OrtAllocator* alloc = alloc_; @@ -2698,68 +2555,13 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Nv EP select an optimization profile for the current context failed"); } - // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache - // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity - // Prepare cache name - std::string cache_path = ""; - // Customize cache prefix if assigned - if (!cache_prefix_.empty()) { - cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->cache_prefix) + trt_state->cache_suffix; - } else { - cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); - } - - // Enable hardware compatility mode if assigned - std::string cache_hw_compat = "_sm" + compute_capability_; - - // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache - // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity - const std::string cache_path_prefix = cache_path + cache_hw_compat; - std::string engine_cache_path = cache_path_prefix + ".engine"; - const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted"; - const std::string profile_cache_path = cache_path_prefix + ".profile"; - - // If weight-stripped engine is enabled and refitted engine cache is not present, - // TRT EP will use the engine cache with ".stripped.engine" appended to the end. 
- const std::filesystem::path engine_cache_fs_path = engine_cache_path; - if (weight_stripped_engine_enable_ && !std::filesystem::exists(engine_cache_fs_path)) { - engine_cache_path = cache_path_prefix + ".stripped.engine"; - weight_stripped_engine_refit_ = true; - } - - // Check and update shape ranges for dynamic shape inputs. - for (int i = 0, end = num_inputs; i < end; ++i) { - auto input = trt_state->network->get()->getInput(i); - const std::string& input_name = input->getName(); - input_names.insert(input_name); - - // If there is any input tensor in shape_ranges, it means this input tensor has dynamic shape and its profile shape values have not yet resolved. - // TRT EP will help determine the min/max/opt profile values based on current input tensor value. - if (shape_ranges.find(input_name) != shape_ranges.end()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "Nv EP failed to parse input tensor and generate optimization profiles."); - } - } - - if (weight_stripped_engine_refit_) { - auto status = RefitEngine(model_path_, - onnx_model_folder_path_, - engine_cache_path, - false /* path check for security */, - onnx_model_bytestream_, - onnx_model_bytestream_size_, - trt_engine, - false /* serialize refitted engine to disk */, - detailed_build_log_); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); - } - } - // Check before using trt_engine if (trt_engine == nullptr) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No engine is found."); } + bool require_io_binding = IsIOBindingRequired(trt_state, ctx); + // Get input and output binding names int total_bindings = trt_engine->getNbIOTensors(); std::vector input_binding_names, output_binding_names; @@ -2776,23 +2578,25 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr /* * Set input shapes and bind input buffers */ - std::vector> scratch_buffers; - for (size_t i = 0, end = input_binding_names.size(); i < end; ++i) { - char const* input_name = input_binding_names[i]; - - size_t input_index = 0; - const auto iter = input_indexes.find(input_name); - if (iter != input_indexes.end()) { - input_index = iter->second; - } - auto input_tensor = ctx.GetInput(input_index); - auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); - const auto tensor_shapes = tensor_info.GetShape(); + auto& scratch_buffers = trt_state->scratch_buffers; + if (require_io_binding) { + scratch_buffers.clear(); + bool skip_input_binding_allowed = true; + for (size_t i = 0, end = input_binding_names.size(); i < end; ++i) { + char const* input_name = input_binding_names[i]; + + size_t input_index = 0; + const auto iter = input_indexes.find(input_name); + if (iter != input_indexes.end()) { + input_index = iter->second; + } - auto status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_tensor_values, shape_tensor_values_int64, scratch_buffers, alloc, stream); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + auto status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_tensor_values, shape_tensor_values_int64, scratch_buffers, alloc, stream, skip_input_binding_allowed); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } } + trt_state->skip_io_binding_allowed = skip_input_binding_allowed; } /* @@ -2806,44 +2610,51 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr 
std::unordered_map output_dim_sizes; output_dim_sizes.reserve(num_outputs); - for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { - char const* output_name = output_binding_names[i]; + if (require_io_binding) { + bool skip_output_binding_allowed = true; + for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { + char const* output_name = output_binding_names[i]; - size_t output_index = 0; - const auto& index_iter = output_indexes.find(output_name); - if (index_iter != output_indexes.end()) { - output_index = index_iter->second; - } + size_t output_index = 0; + const auto& index_iter = output_indexes.find(output_name); + if (index_iter != output_indexes.end()) { + output_index = index_iter->second; + } - size_t output_type = 0; - const auto type_iter = output_types.find(output_name); - if (type_iter != output_types.end()) { - output_type = type_iter->second; - } + size_t output_type = 0; + const auto type_iter = output_types.find(output_name); + if (type_iter != output_types.end()) { + output_type = type_iter->second; + } + + nvinfer1::Dims dims; + void* data_ptr = nullptr; + + Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, + dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } - Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, - dds_output_allocator_map, scratch_buffers, alloc, buffers); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims}; } + + trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed; } // Set execution context memory - if (trt_state->context_memory_sharing_enable) { -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) -#endif + if (require_io_binding) { size_t mem_size = trt_engine->getDeviceMemorySizeV2(); -#if defined(_MSC_VER) -#pragma warning(pop) -#endif - if (mem_size > *max_context_mem_size_ptr) { - *max_context_mem_size_ptr = mem_size; + if (trt_state->is_dynamic_shape) { + mem_size = trt_context->updateDeviceMemorySizeForShapes(); + } + if (trt_state->context_memory_size != mem_size) { + LOGS_DEFAULT(INFO) << "[NvTensorRTRTX EP] A new context memory was allocated with size " << mem_size; + trt_state->context_memory = IAllocator::MakeUniquePtrFromOrtAllocator(alloc, mem_size, false /*use_reserve*/); + trt_state->context_memory_size = mem_size; + trt_context->setDeviceMemoryV2(trt_state->context_memory.get(), mem_size); } - trt_context->setDeviceMemory(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, *max_context_mem_size_ptr).get()); } - // Start CUDA graph capture. // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. 
@@ -2894,7 +2705,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr if (index_iter != output_indexes.end()) { output_index = index_iter->second; } - auto status = BindKernelOutput(ctx, &mem_info, dds_output_allocator_map, output_name, output_index, output_type, stream); + auto status = BindKernelOutput(ctx, dds_output_allocator_map, output_name, output_index, output_type, stream); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage()); } @@ -2961,33 +2772,19 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra // // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - if (context_memory_sharing_enable_) { -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) -#endif - size_t mem_size = trt_engine->getDeviceMemorySizeV2(); -#if defined(_MSC_VER) -#pragma warning(pop) -#endif - if (mem_size > max_ctx_mem_size_) { - max_ctx_mem_size_ = mem_size; - } - trt_context = std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); - - } else { - trt_context = std::unique_ptr(trt_engine->createExecutionContext()); - } + trt_context = std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); if (!trt_context) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "Nv EP could not build execution context for fused node: " + fused_node.Name()); } + bool is_dynamic_shape_context = false; // Create input/output to index maps for (int32_t i = 0; i < trt_engine->getNbIOTensors(); ++i) { auto const& name = trt_engine->getIOTensorName(i); auto const& mode = trt_engine->getTensorIOMode(name); if (mode == nvinfer1::TensorIOMode::kINPUT) { + is_dynamic_shape_context |= checkTrtDimIsDynamic(trt_engine->getTensorShape(name)); const auto& iter = input_map.find(name); if (iter != input_map.end()) { input_indexes[name] = iter->second; @@ -3027,9 +2824,8 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra &contexts_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], - context_memory_sharing_enable_, - &max_ctx_mem_size_, - &tensorrt_mu_}; + &tensorrt_mu_, + is_dynamic_shape_context}; *state = p.release(); return 0; }; @@ -3056,15 +2852,14 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra auto& dds_output_allocator_map = this->dds_output_allocator_maps_[fused_node_name]; auto trt_engine = trt_state->engine->get(); auto trt_context = trt_state->context->get(); - auto max_context_mem_size_ptr = trt_state->max_context_mem_size_ptr; int num_outputs = static_cast(output_indexes.size()); std::unordered_map> shape_tensor_values; // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run std::unordered_map> shape_tensor_values_int64; // same as above but for int64 shape tensor input - OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NVIDIA, - narrow(device_id_)); - OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device); if (alloc_ == nullptr) { + OrtDevice device(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NVIDIA, + narrow(device_id_)); + OrtMemoryInfo mem_info("", OrtAllocatorType::OrtDeviceAllocator, device); Ort::ThrowOnError(api->KernelContext_GetAllocator(context, 
&mem_info, &alloc_)); } OrtAllocator* alloc = alloc_; @@ -3078,6 +2873,8 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No engine is found."); } + bool require_io_binding = IsIOBindingRequired(trt_state, ctx); + // Get input and output binding names int total_bindings = trt_engine->getNbIOTensors(); std::vector input_binding_names, output_binding_names; @@ -3094,20 +2891,25 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra /* * Set input shapes and bind input buffers */ - std::vector> scratch_buffers; - for (size_t i = 0, end = input_binding_names.size(); i < end; ++i) { - char const* input_name = input_binding_names[i]; - - size_t input_index = 0; - const auto iter = input_indexes.find(input_name); - if (iter != input_indexes.end()) { - input_index = iter->second; - } + auto& scratch_buffers = trt_state->scratch_buffers; + if (require_io_binding) { + scratch_buffers.clear(); + bool skip_input_binding_allowed = true; + for (size_t i = 0, end = input_binding_names.size(); i < end; ++i) { + char const* input_name = input_binding_names[i]; + + size_t input_index = 0; + const auto iter = input_indexes.find(input_name); + if (iter != input_indexes.end()) { + input_index = iter->second; + } - Status status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_tensor_values, shape_tensor_values_int64, scratch_buffers, alloc, stream); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + Status status = BindContextInput(ctx, trt_engine, trt_context, input_name, input_index, shape_tensor_values, shape_tensor_values_int64, scratch_buffers, alloc, stream, skip_input_binding_allowed); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } } + trt_state->skip_io_binding_allowed = skip_input_binding_allowed; } /* @@ -3121,44 +2923,52 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra std::unordered_map output_dim_sizes; output_dim_sizes.reserve(num_outputs); - for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { - char const* output_name = output_binding_names[i]; + if (require_io_binding) { + bool skip_output_binding_allowed = true; + for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { + char const* output_name = output_binding_names[i]; - size_t output_index = 0; - const auto& index_iter = output_indexes.find(output_name); - if (index_iter != output_indexes.end()) { - output_index = index_iter->second; - } + size_t output_index = 0; + const auto& index_iter = output_indexes.find(output_name); + if (index_iter != output_indexes.end()) { + output_index = index_iter->second; + } - size_t output_type = 0; - const auto type_iter = output_types.find(output_name); - if (type_iter != output_types.end()) { - output_type = type_iter->second; - } + size_t output_type = 0; + const auto type_iter = output_types.find(output_name); + if (type_iter != output_types.end()) { + output_type = type_iter->second; + } + + nvinfer1::Dims dims; + void* data_ptr = nullptr; + + Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, + dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } - 
Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, - dds_output_allocator_map, scratch_buffers, alloc, buffers); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims}; } + + trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed; } // Set execution context memory - if (trt_state->context_memory_sharing_enable) { -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) -#endif + if (require_io_binding) { size_t mem_size = trt_engine->getDeviceMemorySizeV2(); -#if defined(_MSC_VER) -#pragma warning(pop) -#endif - if (mem_size > *max_context_mem_size_ptr) { - *max_context_mem_size_ptr = mem_size; + if (trt_state->is_dynamic_shape) { + mem_size = trt_context->updateDeviceMemorySizeForShapes(); + } + if (trt_state->context_memory_size != mem_size) { + LOGS_DEFAULT(INFO) << "[NvTensorRTRTX EP] A new context memory was allocated with size " << mem_size; + trt_state->context_memory = IAllocator::MakeUniquePtrFromOrtAllocator(alloc, mem_size, false /*use_reserve*/); + // trt_state->context_memory = IAllocator::MakeUniquePtr(alloc, mem_size, false /*use_reserve*/, stream); + trt_state->context_memory_size = mem_size; + trt_context->setDeviceMemoryV2(trt_state->context_memory.get(), mem_size); } - trt_context->setDeviceMemory(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, *max_context_mem_size_ptr).get()); } - // Start CUDA graph capture. // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. 
@@ -3209,7 +3019,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra if (index_iter != output_indexes.end()) { output_index = index_iter->second; } - auto status = BindKernelOutput(ctx, &mem_info, dds_output_allocator_map, output_name, output_index, output_type, stream); + auto status = BindKernelOutput(ctx, dds_output_allocator_map, output_name, output_index, output_type, stream); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage()); } diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h index 7a0c47d28c81d..83b89a2e9d1fb 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h @@ -78,6 +78,9 @@ using unique_pointer = std::unique_ptr; // class OutputAllocator : public nvinfer1::IOutputAllocator { public: + OutputAllocator() = delete; + OutputAllocator(OrtAllocator* allocator) : alloc_(allocator) {}; + void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, cudaStream_t stream) noexcept override; void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override; @@ -95,10 +98,11 @@ class OutputAllocator : public nvinfer1::IOutputAllocator { } ~OutputAllocator() override { - cudaFree(outputPtr); + alloc_->Free(alloc_, outputPtr); } private: + OrtAllocator* alloc_; void* outputPtr{nullptr}; uint64_t allocated_size = 0; std::vector output_shapes; @@ -110,6 +114,45 @@ class OutputAllocator : public nvinfer1::IOutputAllocator { */ using ShapeRangesMap = std::unordered_map>>>; +/** + * @brief Container for tensor data and their shape. + * + */ +struct TensorParams { + const void* data{nullptr}; + nvinfer1::Dims dims; + + TensorParams() = default; + + TensorParams(const void* data_ptr, const std::vector& shape) { + // Initialize data and dims from the Ort::ConstValue + data = data_ptr; + + dims.nbDims = static_cast(shape.size()); + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = static_cast(shape[i]); + } + } + + TensorParams(const void* data_ptr, nvinfer1::Dims& shape) { + // Initialize data and dims from the Ort::ConstValue + data = data_ptr; + + dims = shape; + } + + bool operator!=(const TensorParams& other) const { + if (data != other.data || dims.nbDims != other.dims.nbDims) + return true; + + for (int i = 0; i < dims.nbDims; ++i) { + if (dims.d[i] != other.dims.d[i]) + return true; + } + return false; + } +}; + // Information to construct kernel function state. 
struct TensorrtFuncState { AllocateFunc test_allocate_func = nullptr; @@ -130,8 +173,6 @@ struct TensorrtFuncState { std::string engine_cache_path; nvinfer1::IRuntime* runtime = nullptr; std::vector profiles; - bool context_memory_sharing_enable = false; - size_t* max_context_mem_size_ptr = nullptr; bool engine_decryption_enable = false; int (*engine_decryption)(const char*, char*, size_t*) = nullptr; int (*engine_encryption)(const char*, char*, size_t) = nullptr; @@ -139,8 +180,16 @@ struct TensorrtFuncState { bool sparsity_enable = false; int auxiliary_streams = -1; bool cuda_graph_enable = 0; + bool is_dynamic_shape = false; std::string cache_prefix; std::string cache_suffix; + std::vector> scratch_buffers; + std::vector input_tensors; + std::vector output_tensors; + bool is_first_run = true; // Indicates if this is the first run of the engine + bool skip_io_binding_allowed = false; // Indicates if input/output binding can be skipped + IAllocatorUniquePtr context_memory = nullptr; + size_t context_memory_size = 0; }; // Minimum information to construct kernel function state for direct engine load code path @@ -153,9 +202,15 @@ struct TensorrtShortFuncState { std::unique_ptr* context = nullptr; std::vector> input_info; std::vector> output_info; - bool context_memory_sharing_enable = false; - size_t* max_context_mem_size_ptr = nullptr; std::mutex* tensorrt_mu_ptr = nullptr; + bool is_dynamic_shape = false; + std::vector> scratch_buffers; + std::vector input_tensors; + std::vector output_tensors; + bool is_first_run = true; // Indicates if this is the first run of the engine + bool skip_io_binding_allowed = false; // Indicates if input/output binding can be skipped + IAllocatorUniquePtr context_memory = nullptr; + size_t context_memory_size = 0; }; // Holds important information for building valid ORT graph. 
@@ -251,9 +306,7 @@ class NvExecutionProvider : public IExecutionProvider { std::mutex tensorrt_mu_; int device_id_; std::string compute_capability_; - bool context_memory_sharing_enable_ = false; size_t max_ctx_mem_size_ = 0; - IAllocatorUniquePtr context_memory_ = nullptr; mutable char model_path_[4096] = {}; // Reserved for max path length bool engine_decryption_enable_ = false; int (*engine_decryption_)(const char*, char*, size_t*) = nullptr; @@ -341,8 +394,6 @@ class NvExecutionProvider : public IExecutionProvider { nvinfer1::IExecutionContext& GetTensorRTContext(std::string fused_node); bool UpdateTensorRTContext(std::string fused_node, std::unique_ptr context); void ResetTensorRTContext(std::string fused_node); - bool CompareProfileShapes(std::string fused_node, ShapeRangesMap& shape_ranges); - void UpdateProfileShapes(std::string fused_node, ShapeRangesMap& shape_ranges); void InitCUDAGraph(); void SetGraphStream(cudaStream_t stream); diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h index 2a67f3c3bec4d..4d6c6fe116076 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h @@ -34,7 +34,6 @@ struct NvExecutionProviderInfo { bool engine_decryption_enable{false}; std::string engine_decryption_lib_path{""}; bool force_sequential_engine_build{false}; - bool context_memory_sharing_enable{false}; std::string timing_cache_path{""}; bool detailed_build_log{false}; bool sparsity_enable{false}; diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h index 22e5eea6924de..ea586ba445ba2 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h @@ -683,4 +683,29 @@ std::string GetCacheSuffix(const std::string& fused_node_name, const std::string } return ""; } + +/* + * Checks if there is a an element with value `-1` in nvinfer1::Dims + */ +static bool checkTrtDimIsDynamic(nvinfer1::Dims dims) { + for (int j = 0, end = dims.nbDims; j < end; ++j) { + if (dims.d[j] == -1) { + return true; + } + } + return false; +} + +/* + * Checks if an nvinfer1::ITensor signales a dynamic shape, + * either due to dynamic shapes or due to it being a shape tensor + */ +static bool checkTrtTensorIsDynamic(nvinfer1::ITensor* tensor) { + if (tensor->isShapeTensor()) { + return true; + } else { + // Execution tensor + return checkTrtDimIsDynamic(tensor->getDimensions()); + } +} } // namespace onnxruntime diff --git a/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc b/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc index 0559699670c4a..19505da1bbe56 100644 --- a/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc +++ b/onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc @@ -394,6 +394,7 @@ TYPED_TEST(NvExecutionProviderTest, IOTypeTests) { } } +#if defined(WIN32) static bool SessionHasEp(Ort::Session& session, const char* ep_name) { // Access the underlying InferenceSession. const OrtSession* ort_session = session; @@ -409,11 +410,10 @@ static bool SessionHasEp(Ort::Session& session, const char* ep_name) { return has_ep; } -#if defined(WIN32) // Tests autoEP feature to automatically select an EP that supports the GPU. // Currently only works on Windows. 
TEST(NvExecutionProviderTest, AutoEp_PreferGpu) { - PathString model_name = ORT_TSTR("nv_execution_provider_data_dyn_test.onnx"); + PathString model_name = ORT_TSTR("nv_execution_provider_auto_ep.onnx"); std::string graph_name = "test"; std::vector dims = {1, 3, 2}; From c65de9ff3110b9acb1da85952124418555aed058 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 7 Aug 2025 02:19:46 +1000 Subject: [PATCH 04/10] Update python bindings to be able to use a shared allocator and/or IDataTransfer registered by a plugin EP in the Environment (#25346) ### Description Add ability to get shared allocator from env. Add ability to create a MemCpyFunc using the IDataTransfer from the environment. ### Motivation and Context --- .../onnxruntime/core/framework/ortdevice.h | 7 ++ .../onnxruntime/core/session/environment.h | 11 +- .../providers/cuda/cuda_provider_factory.cc | 4 + onnxruntime/core/session/environment.cc | 34 ++++-- .../onnxruntime_inference_collection.py | 71 ++++++----- .../python/onnxruntime_pybind_mlvalue.cc | 91 +++++++++++---- .../python/onnxruntime_pybind_mlvalue.h | 32 ++--- .../python/onnxruntime_pybind_ortvalue.cc | 110 ++++++++++++------ .../python/onnxruntime_pybind_state.cc | 71 ++++++++--- .../python/onnxruntime_test_python_autoep.py | 40 ++++++- 10 files changed, 341 insertions(+), 130 deletions(-) diff --git a/include/onnxruntime/core/framework/ortdevice.h b/include/onnxruntime/core/framework/ortdevice.h index 536d641b4eef9..fea970b84fd84 100644 --- a/include/onnxruntime/core/framework/ortdevice.h +++ b/include/onnxruntime/core/framework/ortdevice.h @@ -150,6 +150,13 @@ struct OrtDevice { return alignment < other.alignment; } + bool EqualIgnoringAlignment(const OrtDevice& other) const { + return device_type == other.device_type && + memory_type == other.memory_type && + vendor_id == other.vendor_id && + device_id == other.device_id; + } + private: // Device type. int32_t device_type : 8; diff --git a/include/onnxruntime/core/session/environment.h b/include/onnxruntime/core/session/environment.h index 306f81df38e48..89467f5238fa9 100644 --- a/include/onnxruntime/core/session/environment.h +++ b/include/onnxruntime/core/session/environment.h @@ -106,6 +106,15 @@ class Environment { return shared_allocators_; } + /** + * Returns an AllocatorPtr for a shared IAllocator based allocator if it matches the memory info. + * The OrtMemoryInfo name and whether it's an arena or device allocator is ignored in the lookup, as is the + * alignment. + * The user calling this function is not expected to know the alignment, and we expect the allocator instance to be + * created with a valid alignment for the device. + */ + AllocatorPtr GetRegisteredSharedAllocator(const OrtMemoryInfo& mem_info) const; + /** * Removes registered allocator that was previously registered for sharing between multiple sessions. */ @@ -171,7 +180,7 @@ class Environment { std::unique_ptr inter_op_thread_pool_; bool create_global_thread_pools_{false}; - std::mutex mutex_; + mutable std::mutex mutex_; // shared allocators from various sources. // CreateAndRegisterAllocator[V2]: IAllocator allocators created by ORT diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index e8d133779f33c..51a8b13cd8261 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -734,6 +734,10 @@ struct CudaEpFactory : OrtEpFactory { } */ + // guard against bad device discovery. 
max devices we expect to add is num_cuda_devices. if we're attempting + // to add more than that we have duplicates in the `devices` array. + max_ep_devices = std::min(max_ep_devices, static_cast(num_cuda_devices)); + int16_t device_id = 0; for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) { const OrtHardwareDevice& device = *devices[i]; diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc index 2b553aecbca6c..dfb2e33f8cb32 100644 --- a/onnxruntime/core/session/environment.cc +++ b/onnxruntime/core/session/environment.cc @@ -72,21 +72,23 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA(); #endif // defined(USE_CUDA) || defined(USE_CUDA_PROVIDER_INTERFACE) namespace { -// Ignore whether there is an arena wrapping the allocator by excluding OrtMemoryInfo.alloc_type from the comparison +// Ignore whether there is an arena wrapping the allocator by excluding OrtMemoryInfo.alloc_type from the comparison. static bool AreOrtMemoryInfosEquivalent( const OrtMemoryInfo& left, const OrtMemoryInfo& right, - bool match_name = true) { + bool match_name = true, + bool ignore_alignment = false) { return left.mem_type == right.mem_type && - left.device == right.device && + (ignore_alignment ? left.device.EqualIgnoringAlignment(right.device) : left.device == right.device) && (!match_name || strcmp(left.name, right.name) == 0); } std::vector::const_iterator FindExistingAllocator(const std::vector& allocators, const OrtMemoryInfo& mem_info, - bool match_name = true) { + bool match_name = true, + bool ignore_alignment = false) { auto ite = std::find_if(std::begin(allocators), std::end(allocators), - [&mem_info, match_name](const AllocatorPtr& alloc_ptr) { + [&mem_info, match_name, ignore_alignment](const AllocatorPtr& alloc_ptr) { // We want to do the equality checking of 2 OrtMemoryInfos sans the OrtAllocatorType field. // This is because we want to avoid registering two allocators for the same device that just // differ on OrtAllocatorType. @@ -96,7 +98,8 @@ std::vector::const_iterator FindExistingAllocator(const std::vecto // OrtDeviceAllocator (which is the only accepted value while registering a custom allocator). // If we allowed this, it could potentially cause a lot of confusion as to which shared allocator // to use for that device and we want to avoid having any ugly logic around this. - return AreOrtMemoryInfosEquivalent(alloc_ptr->Info(), mem_info, match_name); + return AreOrtMemoryInfosEquivalent(alloc_ptr->Info(), mem_info, + match_name, ignore_alignment); }); return ite; @@ -428,8 +431,25 @@ Status Environment::CreateAndRegisterAllocatorV2(const std::string& provider_typ } Environment::~Environment() { - // need to make sure all the OrtAllocator instances are released prior to any plugin EPs being freed + // need to make sure all the OrtAllocator instances are released prior to any plugin EPs being freed. + // this is because any entry in shared_allocators_ wrapping an OrtAllocator from a plugin EP owns the OrtAllocator + // instance and will call Release on it. If the plugin EP has been freed the Release will fail. shared_allocators_.clear(); + +#if !defined(ORT_MINIMAL_BUILD) + // unregister any remaining EP libraries so they're cleaned up in a determistic way. 
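+  // Each UnregisterExecutionProviderLibrary call is expected to erase its entry from ep_libraries_,
+  // so taking begin() on every iteration visits each remaining library once and the loop terminates.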
+ while (!ep_libraries_.empty()) { + auto it = ep_libraries_.begin(); + ORT_IGNORE_RETURN_VALUE(UnregisterExecutionProviderLibrary(it->first)); + } +#endif +} + +AllocatorPtr Environment::GetRegisteredSharedAllocator(const OrtMemoryInfo& mem_info) const { + std::lock_guard lock{mutex_}; + + auto it = FindExistingAllocator(shared_allocators_, mem_info, /*match_name*/ false, /*ignore_alignment*/ true); + return it != shared_allocators_.end() ? *it : nullptr; } Status Environment::GetSharedAllocator(const OrtMemoryInfo& mem_info, OrtAllocator*& allocator) { diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index e8e51db13bcd3..64c4ada07f28f 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -21,7 +21,7 @@ import onnxruntime -def get_ort_device_type(device_type: str, device_index) -> C.OrtDevice: +def get_ort_device_type(device_type: str) -> int: if device_type == "cuda": return C.OrtDevice.cuda() elif device_type == "cann": @@ -32,8 +32,10 @@ def get_ort_device_type(device_type: str, device_index) -> C.OrtDevice: return C.OrtDevice.dml() elif device_type == "webgpu": return C.OrtDevice.webgpu() - elif device_type == "ort": - return C.get_ort_device(device_index).device_type() + elif device_type == "gpu": + return C.OrtDevice.gpu() + elif device_type == "npu": + return C.OrtDevice.npu() else: raise Exception("Unsupported device type: " + device_type) @@ -765,7 +767,7 @@ def bind_input(self, name, device_type, device_id, element_type, shape, buffer_p self._iobinding.bind_input( name, C.OrtDevice( - get_ort_device_type(device_type, device_id), + get_ort_device_type(device_type), C.OrtDevice.default_memory(), device_id, ), @@ -812,7 +814,7 @@ def bind_output( self._iobinding.bind_output( name, C.OrtDevice( - get_ort_device_type(device_type, device_id), + get_ort_device_type(device_type), C.OrtDevice.default_memory(), device_id, ), @@ -823,7 +825,7 @@ def bind_output( self._iobinding.bind_output( name, C.OrtDevice( - get_ort_device_type(device_type, device_id), + get_ort_device_type(device_type), C.OrtDevice.default_memory(), device_id, ), @@ -889,7 +891,7 @@ def _get_c_value(self) -> C.OrtValue: return self._ortvalue @classmethod - def ortvalue_from_numpy(cls, numpy_obj: np.ndarray, /, device_type="cpu", device_id=0) -> OrtValue: + def ortvalue_from_numpy(cls, numpy_obj: np.ndarray, /, device_type="cpu", device_id=0, vendor_id=-1) -> OrtValue: """ Factory method to construct an OrtValue (which holds a Tensor) from a given Numpy object A copy of the data in the Numpy object is held by the OrtValue only if the device is NOT cpu @@ -897,6 +899,7 @@ def ortvalue_from_numpy(cls, numpy_obj: np.ndarray, /, device_type="cpu", device :param numpy_obj: The Numpy object to construct the OrtValue from :param device_type: e.g. cpu, cuda, cann, cpu by default :param device_id: device id, e.g. 0 + :param vendor_id: The device's PCI vendor id. If provided, the device_type should be "gpu" or "npu". 
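+
+        Example (illustrative sketch, assuming a build or plugin EP that can actually
+        allocate on the requested device; 0x10DE is NVIDIA's PCI vendor id)::
+
+            x = np.ones((2, 3), dtype=np.float32)
+            gpu_value = OrtValue.ortvalue_from_numpy(x, "gpu", 0, 0x10DE)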
""" # Hold a reference to the numpy object (if device_type is 'cpu') as the OrtValue # is backed directly by the data buffer of the numpy object and so the numpy object @@ -904,11 +907,7 @@ def ortvalue_from_numpy(cls, numpy_obj: np.ndarray, /, device_type="cpu", device return cls( C.OrtValue.ortvalue_from_numpy( numpy_obj, - C.OrtDevice( - get_ort_device_type(device_type, device_id), - C.OrtDevice.default_memory(), - device_id, - ), + OrtDevice.make(device_type, device_id, vendor_id)._get_c_device(), ), numpy_obj if device_type.lower() == "cpu" else None, ) @@ -929,7 +928,7 @@ def ortvalue_from_numpy_with_onnx_type(cls, data: np.ndarray, /, onnx_element_ty @classmethod def ortvalue_from_shape_and_type( - cls, shape: Sequence[int], element_type, device_type: str = "cpu", device_id: int = 0 + cls, shape: Sequence[int], element_type, device_type: str = "cpu", device_id: int = 0, vendor_id: int = -1 ) -> OrtValue: """ Factory method to construct an OrtValue (which holds a Tensor) from given shape and element_type @@ -938,7 +937,11 @@ def ortvalue_from_shape_and_type( :param element_type: The data type of the elements. It can be either numpy type (like numpy.float32) or an integer for onnx type (like onnx.TensorProto.BFLOAT16). :param device_type: e.g. cpu, cuda, cann, cpu by default :param device_id: device id, e.g. 0 + :param vendor_id: If provided the device type should be "gpu" or "npu". """ + + device = OrtDevice.make(device_type, device_id, vendor_id)._get_c_device() + # Integer for onnx element type (see https://onnx.ai/onnx/api/mapping.html). # This is helpful for some data type (like TensorProto.BFLOAT16) that is not available in numpy. if isinstance(element_type, int): @@ -946,11 +949,7 @@ def ortvalue_from_shape_and_type( C.OrtValue.ortvalue_from_shape_and_onnx_type( shape, element_type, - C.OrtDevice( - get_ort_device_type(device_type, device_id), - C.OrtDevice.default_memory(), - device_id, - ), + device, ) ) @@ -958,11 +957,7 @@ def ortvalue_from_shape_and_type( C.OrtValue.ortvalue_from_shape_and_type( shape, element_type, - C.OrtDevice( - get_ort_device_type(device_type, device_id), - C.OrtDevice.default_memory(), - device_id, - ), + device, ) ) @@ -1085,14 +1080,27 @@ def _get_c_device(self): return self._ort_device @staticmethod - def make(ort_device_name, device_id): - return OrtDevice( - C.OrtDevice( - get_ort_device_type(ort_device_name, device_id), - C.OrtDevice.default_memory(), - device_id, + def make(ort_device_name, device_id, vendor_id=-1): + if vendor_id < 0: + # backwards compatibility with predefined OrtDevice names + return OrtDevice( + C.OrtDevice( + get_ort_device_type(ort_device_name), + C.OrtDevice.default_memory(), + device_id, + ) + ) + else: + # generic. use GPU or NPU for ort_device_name and provide a vendor id. + # vendor id of 0 is valid in some cases (e.g. 
webgpu is generic and does not have a vendor id) + return OrtDevice( + C.OrtDevice( + get_ort_device_type(ort_device_name), + C.OrtDevice.default_memory(), + vendor_id, + device_id, + ) ) - ) def device_id(self): return self._ort_device.device_id() @@ -1100,6 +1108,9 @@ def device_id(self): def device_type(self): return self._ort_device.device_type() + def device_vendor_id(self): + return self._ort_device.vendor_id() + class SparseTensor: """ diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 958c9fc46bcd8..590e1ef3cdbdb 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -99,6 +99,44 @@ TensorShape GetShape(const py::array& arr) { return shape; } +AllocatorPtr GetSharedAllocator(const OrtDevice& device) { + auto& env = GetOrtEnv()->GetEnvironment(); + + OrtMemoryInfo mem_info("ignored", OrtDeviceAllocator, device); + return env.GetRegisteredSharedAllocator(mem_info); +} + +MemCpyFunc CreateDataTransferMemCpy([[maybe_unused]] const OrtDevice& src_device, + [[maybe_unused]] const OrtDevice& dst_device) { +#if defined(ORT_MINIMAL_BUILD) + // plugin EPs are not supported in a minimal build so there won't be any data transfers registered + return nullptr; +#else + + auto& env = GetOrtEnv()->GetEnvironment(); + const DataTransferManager& data_transfer_manager = env.GetDataTransferManager(); + const IDataTransfer* data_transfer = data_transfer_manager.GetDataTransfer(src_device, dst_device); + if (!data_transfer) { + return nullptr; + } + + const auto copy_func = [src_device, dst_device, data_transfer](void* dst, const void* src, size_t bytes) { + OrtMemoryInfo src_memory_info("ignored", OrtDeviceAllocator, src_device); + OrtMemoryInfo dst_memory_info("ignored", OrtDeviceAllocator, dst_device); + + // real shape doesn't matter as the Tensor instances here are temporary in order to be able to call CopyTensor. + // we set the shape to `bytes` and the data type to uint8_t to copy the correct number of bytes. 
+ TensorShape shape = {narrow(bytes)}; + Tensor src_tensor{DataTypeImpl::GetType(), shape, const_cast(src), src_memory_info}; + Tensor dst_tensor{DataTypeImpl::GetType(), shape, dst, dst_memory_info}; + + ORT_THROW_IF_ERROR(data_transfer->CopyTensor(src_tensor, dst_tensor)); + }; + + return copy_func; +#endif +} + void CpuToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { memcpy(dst, src, num_bytes); } @@ -158,9 +196,10 @@ void CudaToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { GetProviderInfo_CUDA().cudaMemcpy_DeviceToHost(dst, src, num_bytes); } -const std::unordered_map* GetCudaToHostMemCpyFunction() { - static std::unordered_map map{ - {OrtDevice::GPU, CudaToCpuMemCpy}}; +const std::unordered_map* GetCudaToHostMemCpyFunction() { + static std::unordered_map map{ + {OrtDevice{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NVIDIA, 0}, CudaToCpuMemCpy}, + }; return ↦ } @@ -215,9 +254,10 @@ void MIGraphXToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { GetProviderInfo_MIGraphX().MIGraphXMemcpy_DeviceToHost(dst, src, num_bytes); } -const std::unordered_map* GetMIGraphXToHostMemCpyFunction() { - static std::unordered_map map{ - {OrtDevice::GPU, MIGraphXToCpuMemCpy}}; +const std::unordered_map* GetMIGraphXToHostMemCpyFunction(const OrtDevice& device) { + static std::unordered_map map{ + {OrtDevice{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD, 0}, MIGraphXToCpuMemCpy}, + }; return ↦ } @@ -334,9 +374,10 @@ void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { D3D12_RESOURCE_STATE_UNORDERED_ACCESS); } -const std::unordered_map* GetDmlToHostMemCpyFunction() { - static std::unordered_map map{ - {OrtDevice::GPU, DmlToCpuMemCpy}}; +const std::unordered_map* GetDmlToHostMemCpyFunction() { + static std::unordered_map map{ + {OrtDevice{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::MICROSOFT, 0}, DmlToCpuMemCpy}, + }; return ↦ } @@ -352,9 +393,10 @@ void CannToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { GetProviderInfo_CANN().cannMemcpy_DeviceToHost(dst, src, num_bytes); } -const std::unordered_map* GetCannToHostMemCpyFunction() { - static std::unordered_map map{ - {OrtDevice::NPU, CannToCpuMemCpy}}; +const std::unordered_map* GetCannToHostMemCpyFunction() { + static std::unordered_map map{ + {OrtDevice{OrtDevice::NPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::HUAWEI, 0}, CannToCpuMemCpy}, + }; return ↦ } @@ -402,9 +444,10 @@ void RocmToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { GetProviderInfo_ROCM().rocmMemcpy_DeviceToHost(dst, src, num_bytes); } -const std::unordered_map* GetRocmToHostMemCpyFunction() { - static std::unordered_map map{ - {OrtDevice::GPU, RocmToCpuMemCpy}}; +const std::unordered_map* GetRocmToHostMemCpyFunction() { + static std::unordered_map map{ + {OrtDevice{OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD, 0}, RocmToCpuMemCpy}, + }; return ↦ } @@ -581,7 +624,7 @@ using OrtPybindSingleUseAllocatorPtr = std::shared_ptr& p_tensor, - MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy) { + const MemCpyFunc& mem_cpy_to_device = CpuToCpuMemCpy) { CopyDataToTensor(darray, npy_type, *p_tensor, mem_cpy_to_device); } -void CopyDataToTensor(const py::array& py_array, int npy_type, Tensor& tensor, MemCpyFunc mem_cpy_to_device) { +void CopyDataToTensor(const py::array& py_array, int npy_type, Tensor& tensor, const MemCpyFunc& mem_cpy_to_device) { CopyDataToTensor(reinterpret_cast(py_array.ptr()), npy_type, tensor, 
mem_cpy_to_device); } @@ -656,7 +699,7 @@ void CopyDataToTensor(const py::array& py_array, int npy_type, Tensor& tensor, M // The numpy object owns the memory and needs to be alive until the corresponding OrtValue is in scope static std::unique_ptr CreateTensor(const AllocatorPtr& alloc, const std::string& name_input, PyArrayObject* pyObject, bool use_numpy_data_memory = true, - MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy) { + const MemCpyFunc& mem_cpy_to_device = CpuToCpuMemCpy) { PyArrayObject* darray = PyArray_GETCONTIGUOUS(pyObject); ORT_ENFORCE(darray != nullptr, "The object must be a contiguous array for input '", name_input, "'."); @@ -746,7 +789,8 @@ static void CreateSequenceOfTensors(AllocatorPtr alloc, const std::string& name_ // as the backing data buffer for the ORT Tensor where applicable (for numeric tensors) // The numpy object owns the memory and needs to be alive until the corresponding OrtValue is in scope static void CreateTensorMLValue(const AllocatorPtr& alloc, const std::string& name_input, PyArrayObject* pyObject, - OrtValue* p_mlvalue, bool use_numpy_data_memory = true, MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy) { + OrtValue* p_mlvalue, bool use_numpy_data_memory = true, + const MemCpyFunc& mem_cpy_to_device = CpuToCpuMemCpy) { auto p_tensor = CreateTensor(alloc, name_input, pyObject, use_numpy_data_memory, mem_cpy_to_device); auto ml_tensor = DataTypeImpl::GetType(); @@ -994,9 +1038,10 @@ static void CreateGenericIterableMLValue(PyObject* iterator, AllocatorPtr alloc, // Setting `use_numpy_data_memory` to `true` will ensure that the underlying numpy array buffer is directly used // as the backing data buffer for the ORT Tensor where applicable (for numeric tensors) // The numpy object owns the memory and needs to be alive until the corresponding OrtValue is in scope -void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input, - const py::object& value, OrtValue* p_mlvalue, bool accept_only_numpy_array, - bool use_numpy_data_memory, MemCpyFunc mem_cpy_to_device) { +void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, + const std::string& name_input, const py::object& value, OrtValue* p_mlvalue, + bool accept_only_numpy_array, bool use_numpy_data_memory, + const MemCpyFunc& mem_cpy_to_device) { onnx::TypeProto type_proto; if (PyObjectCheck_NumpyArray(value.ptr())) { // The most frequent case: input comes as an array. 
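The to-host copy tables above now use a full `OrtDevice` (device type, memory type, vendor id) as the key instead of a bare device-type value. A self-contained Python sketch of the lookup rule that goes with such a table (matching on type, memory type and vendor while ignoring the device id, as the state-side lookup later in this patch does); the class and function names here are illustrative stand-ins, not the real ORT types:

```python
from dataclasses import dataclass
from typing import Callable, Dict, Optional

# stand-in for OrtDevice: device type, memory type, vendor id, device id
@dataclass(frozen=True)
class Device:
    dev_type: str       # "CPU", "GPU", "NPU"
    mem_type: str       # "DEFAULT", ...
    vendor_id: int
    device_id: int = 0

# copy function signature: (destination buffer, source bytes) -> None
CopyFn = Callable[[bytearray, bytes], None]

def find_to_host_copy_fn(table: Dict[Device, CopyFn], device: Device) -> Optional[CopyFn]:
    # Match on device type, memory type and vendor id; the device id is ignored,
    # mirroring how the pybind layer resolves its device-to-host copy helpers.
    for key, fn in table.items():
        if (key.dev_type == device.dev_type
                and key.mem_type == device.mem_type
                and key.vendor_id == device.vendor_id):
            return fn
    return None

def fake_gpu_to_cpu(dst: bytearray, src: bytes) -> None:
    dst[:] = src  # placeholder for a real device-to-host memcpy

table = {Device("GPU", "DEFAULT", 0x10DE): fake_gpu_to_cpu}  # NVIDIA entry
fn = find_to_host_copy_fn(table, Device("GPU", "DEFAULT", 0x10DE, device_id=1))
assert fn is fake_gpu_to_cpu
```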
diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.h b/onnxruntime/python/onnxruntime_pybind_mlvalue.h index e9bafea2ed1b5..7b65c0aae45c1 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.h +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.h @@ -42,22 +42,27 @@ MLDataType NumpyTypeToOnnxRuntimeTensorType(int numpy_type); MLDataType OnnxTypeToOnnxRuntimeTensorType(int onnx_element_type); -using MemCpyFunc = void (*)(void*, const void*, size_t); - +using MemCpyFunc = std::function; using DataTransferAlternative = std::variant; +// helpers to get allocator and IDataTransfer from Environment for plugin EP +AllocatorPtr GetSharedAllocator(const OrtDevice& device); +MemCpyFunc CreateDataTransferMemCpy(const OrtDevice& src_device, const OrtDevice& dst_device); + void CpuToCpuMemCpy(void*, const void*, size_t); -void CopyDataToTensor(const pybind11::array& py_array, int npy_type, Tensor& tensor, MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy); +void CopyDataToTensor(const pybind11::array& py_array, int npy_type, Tensor& tensor, + const MemCpyFunc& mem_cpy_to_device = CpuToCpuMemCpy); pybind11::object AddTensorAsPyObj(const OrtValue& val, const DataTransferManager* data_transfer_manager, - const std::unordered_map* mem_cpy_to_host_functions); + const std::unordered_map* mem_cpy_to_host_functions); -pybind11::object GetPyObjectFromSparseTensor(size_t pos, const OrtValue& ort_value, const DataTransferManager* data_transfer_manager); +pybind11::object GetPyObjectFromSparseTensor(size_t pos, const OrtValue& ort_value, + const DataTransferManager* data_transfer_manager); pybind11::object AddNonTensorAsPyObj(const OrtValue& val, const DataTransferManager* data_transfer_manager, - const std::unordered_map* mem_cpy_to_host_functions); + const std::unordered_map* mem_cpy_to_host_functions); OrtMemoryInfo GetMemoryInfoPerDeviceType(const OrtDevice& ort_device); @@ -69,7 +74,7 @@ void CpuToCudaMemCpy(void* dst, const void* src, size_t num_bytes); void CudaToCpuMemCpy(void* dst, const void* src, size_t num_bytes); -const std::unordered_map* GetCudaToHostMemCpyFunction(); +const std::unordered_map* GetCudaToHostMemCpyFunction(); bool IsCudaDeviceIdValid(const onnxruntime::logging::Logger& logger, int id); @@ -87,7 +92,7 @@ void CpuToDmlMemCpy(void* dst, const void* src, size_t num_bytes); void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes); -const std::unordered_map* GetDmlToHostMemCpyFunction(); +const std::unordered_map* GetDmlToHostMemCpyFunction(); #endif @@ -97,7 +102,7 @@ void CpuToMIGraphXMemCpy(void* dst, const void* src, size_t num_bytes); void MIGraphXToCpuMemCpy(void* dst, const void* src, size_t num_bytes); -const std::unordered_map* GetMIGraphXToHostMemCpyFunction(); +const std::unordered_map* GetMIGraphXToHostMemCpyFunction(); AllocatorPtr GetMIGraphXAllocator(OrtDevice::DeviceId id); @@ -109,7 +114,7 @@ void CpuToCannMemCpy(void* dst, const void* src, size_t num_bytes); void CannToCpuMemCpy(void* dst, const void* src, size_t num_bytes); -const std::unordered_map* GetCannToHostMemCpyFunction(); +const std::unordered_map* GetCannToHostMemCpyFunction(); bool IsCannDeviceIdValid(const onnxruntime::logging::Logger& logger, int id); @@ -127,17 +132,18 @@ void CpuToRocmMemCpy(void* dst, const void* src, size_t num_bytes); void RocmToCpuMemCpy(void* dst, const void* src, size_t num_bytes); -const std::unordered_map* GetRocmToHostMemCpyFunction(); +const std::unordered_map* GetRocmToHostMemCpyFunction(); #endif void CreateGenericMLValue(const onnxruntime::InputDefList* 
input_def_list, const AllocatorPtr& alloc, const std::string& name_input, const pybind11::object& value, OrtValue* p_mlvalue, - bool accept_only_numpy_array = false, bool use_numpy_data_memory = true, MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy); + bool accept_only_numpy_array = false, bool use_numpy_data_memory = true, + const MemCpyFunc& mem_cpy_to_device = CpuToCpuMemCpy); pybind11::object GetPyObjFromTensor(const OrtValue& rtensor, const DataTransferManager* data_transfer_manager = nullptr, - const std::unordered_map* mem_cpy_to_host_functions = nullptr); + const std::unordered_map* mem_cpy_to_host_functions = nullptr); // The below two functions are used to convert OrtValue to numpy arrays diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index d1d4d6f3cdad5..7234543eb14de 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -23,42 +23,57 @@ std::unique_ptr OrtValueFromShapeAndType(const std::vector& s MLDataType element_type, const OrtDevice& device) { AllocatorPtr allocator; + if (strcmp(GetDeviceName(device), CPU) == 0) { allocator = GetAllocator(); - } else if (strcmp(GetDeviceName(device), CUDA) == 0) { + } else { +#if !defined(ORT_MINIMAL_BUILD) + // prefer a shared allocator from the environment. + // these are provided by plugin EPs or custom allocators explicitly registered by the user. + allocator = GetSharedAllocator(device); +#endif + + if (!allocator) { + if (strcmp(GetDeviceName(device), CUDA) == 0) { #ifdef USE_CUDA - if (!IsCudaDeviceIdValid(logging::LoggingManager::DefaultLogger(), device.Id())) { - throw std::runtime_error("The provided device id doesn't match any available GPUs on the machine."); - } - allocator = GetCudaAllocator(device.Id()); + if (!IsCudaDeviceIdValid(logging::LoggingManager::DefaultLogger(), device.Id())) { + throw std::runtime_error("The provided device id doesn't match any available GPUs on the machine."); + } + + allocator = GetCudaAllocator(device.Id()); #else - throw std::runtime_error( - "Can't allocate memory on the CUDA device using this package of OnnxRuntime. " - "Please use the CUDA package of OnnxRuntime to use this feature."); + throw std::runtime_error( + "Can't allocate memory on the CUDA device using this package of OnnxRuntime. " + "Please use the CUDA package of OnnxRuntime to use this feature."); #endif - } else if (strcmp(GetDeviceName(device), HIP) == 0) { + } else if (strcmp(GetDeviceName(device), HIP) == 0) { #if USE_ROCM - if (!IsRocmDeviceIdValid(logging::LoggingManager::DefaultLogger(), device.Id())) { - throw std::runtime_error("The provided device id doesn't match any available GPUs on the machine."); - } - allocator = GetRocmAllocator(device.Id()); + if (!IsRocmDeviceIdValid(logging::LoggingManager::DefaultLogger(), device.Id())) { + throw std::runtime_error("The provided device id doesn't match any available GPUs on the machine."); + } + + allocator = GetRocmAllocator(device.Id()); #elif USE_MIGRAPHX - allocator = GetMIGraphXAllocator(device.Id()); + allocator = GetMIGraphXAllocator(device.Id()); #else - throw std::runtime_error( - "Can't allocate memory on the AMD device using this package of OnnxRuntime. " - "Please use the ROCm package of OnnxRuntime to use this feature."); + throw std::runtime_error( + "Can't allocate memory on the AMD device using this package of OnnxRuntime. 
" + "Please use the ROCm package of OnnxRuntime to use this feature."); #endif - } else if (strcmp(GetDeviceName(device), DML) == 0) { + } else if (strcmp(GetDeviceName(device), DML) == 0) { #if USE_DML - allocator = GetDmlAllocator(device.Id()); + allocator = GetDmlAllocator(device.Id()); #else - throw std::runtime_error( - "Can't allocate memory on the DirectML device using this package of OnnxRuntime. " - "Please use the DirectML package of OnnxRuntime to use this feature."); + throw std::runtime_error( + "Can't allocate memory on the DirectML device using this package of OnnxRuntime. " + "Please use the DirectML package of OnnxRuntime to use this feature."); #endif - } else { - throw std::runtime_error("Unsupported device: Cannot place the OrtValue on this device"); + } + } + + if (!allocator) { + throw std::runtime_error("Unsupported device: Cannot place the OrtValue on this device"); + } } auto ml_value = std::make_unique(); @@ -90,7 +105,8 @@ void addOrtValueMethods(pybind11::module& m) { if (device.Vendor() == OrtDevice::VendorIds::MICROSOFT) { // InputDeflist is null because OrtValue creation is not tied to a specific model // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) - // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in DML + // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors + // in DML CreateGenericMLValue( nullptr, GetDmlAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToDmlMemCpy); } else @@ -103,8 +119,10 @@ void addOrtValueMethods(pybind11::module& m) { // InputDeflist is null because OrtValue creation is not tied to a specific model // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) - // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in CUDA - CreateGenericMLValue(nullptr, GetCudaAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToCudaMemCpy); + // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors + // in CUDA + CreateGenericMLValue(nullptr, GetCudaAllocator(device.Id()), "", array_on_cpu, ml_value.get(), + true, false, CpuToCudaMemCpy); } else #endif #ifdef USE_ROCM @@ -115,22 +133,34 @@ void addOrtValueMethods(pybind11::module& m) { // InputDeflist is null because OrtValue creation is not tied to a specific model // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) - // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in CUDA - CreateGenericMLValue(nullptr, GetRocmAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToRocmMemCpy); + // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors + // in ROCM + CreateGenericMLValue(nullptr, GetRocmAllocator(device.Id()), "", array_on_cpu, ml_value.get(), + true, false, CpuToRocmMemCpy); } else #endif #if USE_MIGRAPHX if (device.Vendor() == OrtDevice::VendorIds::AMD) { // InputDeflist is null because OrtValue creation is not tied to a specific model // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) - // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in 
MIGraphX - CreateGenericMLValue(nullptr, GetMIGraphXAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToMIGraphXMemCpy); + // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors + // in MIGraphX + CreateGenericMLValue(nullptr, GetMIGraphXAllocator(device.Id()), "", array_on_cpu, ml_value.get(), + true, false, CpuToMIGraphXMemCpy); } else #endif { - throw std::runtime_error( - "Can't allocate memory on the CUDA device using this package of OnnxRuntime. " - "Please use the CUDA package of OnnxRuntime to use this feature."); + // see if we can do the copy with an allocator and IDataTransfer registered by a plugin EP + auto allocator = GetSharedAllocator(device); + auto cpu_to_device_copy_fn = allocator ? CreateDataTransferMemCpy(OrtDevice{}, device) : nullptr; + if (cpu_to_device_copy_fn) { + CreateGenericMLValue(nullptr, allocator, "", array_on_cpu, ml_value.get(), true, false, + cpu_to_device_copy_fn); + } else { + throw std::runtime_error( + "Can't allocate memory on the device using this package of OnnxRuntime. " + "Please use the appropriate package of OnnxRuntime for your hardware to use this feature."); + } } } else if (device.Type() == OrtDevice::NPU && device.Vendor() == OrtDevice::VendorIds::HUAWEI) { #ifdef USE_CANN @@ -214,8 +244,16 @@ void addOrtValueMethods(pybind11::module& m) { } else #endif { - throw std::runtime_error( - "Unsupported GPU device: Cannot find the supported GPU device."); + // see if we can do the copy with an allocator and IDataTransfer registered by a plugin EP + auto allocator = GetSharedAllocator(device); + auto cpu_to_device_copy_fn = allocator ? CreateDataTransferMemCpy(OrtDevice{}, device) : nullptr; + if (cpu_to_device_copy_fn) { + onnxruntime::python::CopyDataToTensor(py_values, values_type, *(ml_value->GetMutable()), + cpu_to_device_copy_fn); + } else { + throw std::runtime_error( + "Unsupported GPU device: Cannot find the supported GPU device."); + } } } else if (device.Type() == OrtDevice::DML) { #if USE_DML diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index acf0681cf8752..03ad0185d1394 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -205,7 +205,7 @@ void AppendLoraParametersAsInputs(const RunOptions& run_options, template static py::object AddNonTensor(const OrtValue& val, const DataTransferManager* /*data_transfer_manager*/, - const std::unordered_map* /*mem_cpy_to_host_functions*/) { + const std::unordered_map* /*mem_cpy_to_host_functions*/) { return py::cast(val.Get()); } @@ -265,39 +265,65 @@ pybind11::array PrimitiveTensorToNumpyFromDevice(const OrtValue& ort_value, cons // pretty much does what a DataTransferManager does - copy data from device(s) to the host py::object GetPyObjFromTensor(const OrtValue& ort_value, const DataTransferManager* data_transfer_manager, - const std::unordered_map* mem_cpy_to_host_functions) { + const std::unordered_map* mem_cpy_to_host_functions) { ORT_ENFORCE(ort_value.IsTensor(), "This function only supports tensors"); const auto& tensor = ort_value.Get(); + const auto& device = tensor.Location().device; + if (tensor.IsDataTypeString()) { - ORT_ENFORCE(tensor.Location().device.Type() == OrtDevice::CPU, "Strings can only be on CPU"); + ORT_ENFORCE(device.Type() == OrtDevice::CPU, "Strings can only be on CPU"); // Create a numpy array of strings (python objects) by copy/converting them py::array result = 
StringTensorToNumpyArray(tensor); return py::cast(result); } - const auto device_type = tensor.Location().device.Type(); + const auto device_type = device.Type(); // Create an numpy array on top of the OrtValue memory, no copy if (device_type == OrtDevice::CPU) { py::array result = PrimitiveTensorToNumpyOverOrtValue(ort_value); return py::cast(result); } - if (!data_transfer_manager && !mem_cpy_to_host_functions) { - throw std::runtime_error( - "GetPyObjFromTensor: Either data transfer manager or a " - "function to copy data to the host is needed to convert non-CPU tensor to numpy array"); - } - py::array result; if (data_transfer_manager != nullptr) { result = PrimitiveTensorToNumpyFromDevice(ort_value, data_transfer_manager); } else { - auto mem_cpy_to_host = mem_cpy_to_host_functions->find(device_type); - ORT_ENFORCE(mem_cpy_to_host != mem_cpy_to_host_functions->end(), - "Unable to locate a function that can copy data to the host from the device"); - result = PrimitiveTensorToNumpyFromDevice(ort_value, mem_cpy_to_host->second); + bool copied = false; + if (mem_cpy_to_host_functions) { + auto it = std::find_if(mem_cpy_to_host_functions->begin(), mem_cpy_to_host_functions->end(), + [&device](const auto& entry) { + const auto& copy_device = entry.first; + // We're ignoring OrtDevice.Id() currently for historical reasons. + // The key to mem_cpy_to_host_functions was previously the device type (CPU/GPU/NPU). + // This changed to be OrtDevice to get the vendor id. + // Assumably it would be better to also match on device id, but that was not possible + // previously and to preserve existing behavior we keep the old logic and expect the + // copy function to handle the device id correctly. + return device.Type() == copy_device.Type() && + device.MemType() == copy_device.MemType() && + device.Vendor() == copy_device.Vendor(); + }); + + if (it != mem_cpy_to_host_functions->end()) { + result = PrimitiveTensorToNumpyFromDevice(ort_value, it->second); + copied = true; + } + } + + if (!copied) { + // see if we have a shared data transfer function from a plugin EP + auto device_to_cpu_copy_func = CreateDataTransferMemCpy(device, OrtDevice{}); + if (device_to_cpu_copy_func) { + result = PrimitiveTensorToNumpyFromDevice(ort_value, device_to_cpu_copy_func); + } else { + throw std::runtime_error( + "GetPyObjFromTensor: Either data transfer manager or a " + "function to copy data to the host is needed to convert non-CPU tensor to numpy array"); + } + } } + return py::cast(result); } @@ -373,7 +399,7 @@ py::object GetPyObjectFromSparseTensor(size_t pos, const OrtValue& ort_value, co template <> py::object AddNonTensor(const OrtValue& val, const DataTransferManager* data_transfer_manager, - const std::unordered_map* mem_cpy_to_host_functions) { + const std::unordered_map* mem_cpy_to_host_functions) { const auto& seq_tensors = val.Get(); py::list py_list; for (const auto& ort_value : seq_tensors) { @@ -389,7 +415,7 @@ py::object AddNonTensor(const OrtValue& val, py::object AddNonTensorAsPyObj(const OrtValue& val, const DataTransferManager* data_transfer_manager, - const std::unordered_map* mem_cpy_to_host_functions) { + const std::unordered_map* mem_cpy_to_host_functions) { // Should be in sync with core/framework/datatypes.h auto val_type = val.Type(); if (val_type->IsTensorSequenceType()) { @@ -429,7 +455,7 @@ py::object AddNonTensorAsPyObj(const OrtValue& val, } py::object AddTensorAsPyObj(const OrtValue& val, const DataTransferManager* data_transfer_manager, - const std::unordered_map* 
mem_cpy_to_host_functions) { + const std::unordered_map* mem_cpy_to_host_functions) { return GetPyObjFromTensor(val, data_transfer_manager, mem_cpy_to_host_functions); } @@ -1885,6 +1911,10 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra vendor = OrtDevice::VendorIds::NVIDIA; #elif USE_ROCM || USE_MIGRAPHX vendor = OrtDevice::VendorIds::AMD; +#endif + } else if (type == OrtDevice::NPU) { +#if USE_CANN + vendor = OrtDevice::VendorIds::HUAWEI; #endif } @@ -1894,12 +1924,15 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra .def("device_id", &OrtDevice::Id, R"pbdoc(Device Id.)pbdoc") .def("device_type", &OrtDevice::Type, R"pbdoc(Device Type.)pbdoc") .def("vendor_id", &OrtDevice::Vendor, R"pbdoc(Vendor Id.)pbdoc") + // generic device types that are typically used with a vendor id. .def_static("cpu", []() { return OrtDevice::CPU; }) + .def_static("gpu", []() { return OrtDevice::GPU; }) + .def_static("npu", []() { return OrtDevice::NPU; }) + // EP specific device types for backward compatibility. .def_static("cuda", []() { return OrtDevice::GPU; }) .def_static("cann", []() { return OrtDevice::NPU; }) - .def_static("fpga", []() { return OrtDevice::FPGA; }) - .def_static("npu", []() { return OrtDevice::NPU; }) .def_static("dml", []() { return OrtDevice::DML; }) + .def_static("fpga", []() { return OrtDevice::FPGA; }) .def_static("webgpu", []() { return OrtDevice::GPU; }) .def_static("default_memory", []() { return OrtDevice::MemType::DEFAULT; }); diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index 0c52740398b7a..cb31627a87c48 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -183,7 +183,7 @@ def test_example_plugin_ep_devices(self): Test registration of an example EP plugin and retrieval of its OrtEpDevice. """ if sys.platform != "win32": - self.skipTest("Skipping test because it device discovery is only supported on Windows") + self.skipTest("Skipping test because device discovery is only supported on Windows") ep_lib_path = "example_plugin_ep.dll" try: @@ -244,6 +244,44 @@ def test_example_plugin_ep_devices(self): del sess # Delete session before unregistering library self.unregister_execution_provider_library(ep_name) + def test_example_plugin_ep_data_transfer(self): + """ + Test usage of shared data transfer and allocator from plugin EP. + """ + if sys.platform != "win32": + self.skipTest("Skipping test because device discovery is only supported on Windows") + + if "DmlExecutionProvider" in onnxrt.get_available_providers(): + self.skipTest("Skipping because DML EP data transfer is broken if we haven't created an inference session") + + ep_lib_path = "example_plugin_ep.dll" + try: + ep_lib_path = get_name("example_plugin_ep.dll") + except FileNotFoundError: + self.skipTest(f"Skipping test because EP library '{ep_lib_path}' cannot be found") + + ep_name = "example_ep" + self.register_execution_provider_library(ep_name, os.path.realpath(ep_lib_path)) + + data = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + data2 = data + 1 + + # the example EP pretends to use GPU memory so we can test data transfer. + # by matching its OrtDevice info we will hit its allocator and data transfer implementations. 
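+        # (0xBE57 below appears to be the vendor id the example plugin EP reports for its fake
+        #  GPU device; against real hardware you would pass the device's actual PCI vendor id.)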
+ # copy data from CPU to the fake GPU memory + gpu_value = onnxrt.OrtValue.ortvalue_from_numpy(data, "gpu", 0, 0xBE57) + # copy back to CPU + cpu_data = gpu_value.numpy() + np.testing.assert_equal(data, cpu_data) + + gpu_value.update_inplace(data2) # update the fake GPU data + cpu_data_2 = gpu_value.numpy() # copy back to CPU + np.testing.assert_equal(data2, cpu_data_2) + + gpu_value = None # Delete OrtValue before unregistering library as the allocator will be destroyed. + + self.unregister_execution_provider_library(ep_name) + if __name__ == "__main__": unittest.main(verbosity=1) From 53a49955adefb882f13831476b3cdd773a52f32b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 6 Aug 2025 09:29:42 -0700 Subject: [PATCH 05/10] upgrade dawn to 794b6fadc4171f7b853a77ffdf0948fbec431f41 (#25461) ### Description upgrade dawn to latest ### Motivation and Context --- .github/workflows/windows_webgpu.yml | 2 +- cmake/CMakeLists.txt | 10 ++++---- cmake/deps.txt | 2 +- cmake/external/abseil-cpp.cmake | 3 --- .../external/onnxruntime_external_deps.cmake | 23 +++++++++++-------- cmake/onnxruntime_java.cmake | 4 ++-- cmake/onnxruntime_nodejs.cmake | 2 +- cmake/onnxruntime_providers_webgpu.cmake | 2 +- cmake/onnxruntime_python.cmake | 2 +- onnxruntime/core/dll/delay_load_hook.cc | 4 ++-- .../core/providers/webgpu/program_manager.cc | 6 ++--- .../core/providers/webgpu/webgpu_context.cc | 8 ++++--- onnxruntime/wasm/wasm_post_build.js | 16 ++++++------- .../templates/mac-cpu-packing-jobs.yml | 6 +++-- 14 files changed, 48 insertions(+), 42 deletions(-) diff --git a/.github/workflows/windows_webgpu.yml b/.github/workflows/windows_webgpu.yml index 996e0d816d51a..65657835da9eb 100644 --- a/.github/workflows/windows_webgpu.yml +++ b/.github/workflows/windows_webgpu.yml @@ -128,7 +128,7 @@ jobs: ${{ matrix.vcpkg_option == 'vcpkg' && '--use_vcpkg' || '' }} ` --cmake_extra_defines ` onnxruntime_BUILD_UNIT_TESTS=ON ` - onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=ON + onnxruntime_BUILD_DAWN_SHARED_LIBRARY=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index bdc18c424efd1..c407f53786a88 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -140,7 +140,7 @@ option(onnxruntime_USE_WEBGPU "Build with WebGPU support. Enable WebGPU via C/C+ option(onnxruntime_WGSL_TEMPLATE "Specify the code generator for WGSL template. Default is static." "static") option(onnxruntime_USE_EXTERNAL_DAWN "Build with treating Dawn as external dependency. Will not link Dawn at build time." 
OFF) option(onnxruntime_CUSTOM_DAWN_SRC_PATH "Path to custom Dawn src dir.") -option(onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY "Build Dawn as a monolithic library" OFF) +option(onnxruntime_BUILD_DAWN_SHARED_LIBRARY "Build Dawn as a shared library" OFF) option(onnxruntime_ENABLE_PIX_FOR_WEBGPU_EP "Adding frame present for PIX to capture a frame" OFF) # The following 2 options are only for Windows option(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN "Enable Vulkan backend for Dawn (on Windows)" OFF) @@ -899,8 +899,8 @@ if (onnxruntime_USE_WEBGPU) # # if (onnxruntime_USE_VCPKG AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") if (FALSE) - if (NOT onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) - message(FATAL_ERROR "onnxruntime_USE_VCPKG is not supported with onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=OFF") + if (onnxruntime_BUILD_DAWN_SHARED_LIBRARY) + message(FATAL_ERROR "onnxruntime_USE_VCPKG is not supported with onnxruntime_BUILD_DAWN_SHARED_LIBRARY=ON") endif() if (onnxruntime_USE_EXTERNAL_DAWN) message(FATAL_ERROR "onnxruntime_USE_VCPKG is not supported with onnxruntime_USE_EXTERNAL_DAWN=ON") @@ -921,8 +921,8 @@ if (onnxruntime_USE_WEBGPU) endif() endif() - if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) - list(APPEND ORT_PROVIDER_FLAGS -DBUILD_DAWN_MONOLITHIC_LIBRARY=1) + if (onnxruntime_BUILD_DAWN_SHARED_LIBRARY) + list(APPEND ORT_PROVIDER_FLAGS -DBUILD_DAWN_SHARED_LIBRARY=1) endif() if (onnxruntime_USE_EXTERNAL_DAWN) list(APPEND ORT_PROVIDER_FLAGS -DUSE_EXTERNAL_DAWN=1) diff --git a/cmake/deps.txt b/cmake/deps.txt index ed1de06f33dcb..7bac2a0fbced9 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -55,6 +55,6 @@ cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.9.2.zip;b7f8dc4a8 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0c12f53da76d0c31b03b9f0f8ec8f3b4.zip;239063aee4946a9af147b473a4c3da78ba7413b4 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96 -dawn;https://github.com/google/dawn/archive/9733be39e18186961d503e064874afe3e9ceb8d1.zip;2a4017c32892b90d072a9102eba90ae691fae36d +dawn;https://github.com/google/dawn/archive/794b6fadc4171f7b853a77ffdf0948fbec431f41.zip;77bb02deace0d140411f02a2fb8f5f925ea6a1b6 kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.9.0.tar.gz;a2765979f64efb173a4b8ba4de39dcba9c655786 duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794 diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake index eede60a4a977a..427e77a524586 100644 --- a/cmake/external/abseil-cpp.cmake +++ b/cmake/external/abseil-cpp.cmake @@ -43,9 +43,6 @@ onnxruntime_fetchcontent_makeavailable(abseil_cpp) FetchContent_GetProperties(abseil_cpp) if(abseil_cpp_SOURCE_DIR) set(ABSEIL_SOURCE_DIR ${abseil_cpp_SOURCE_DIR}) - if(onnxruntime_USE_WEBGPU) - set(DAWN_ABSEIL_DIR ${abseil_cpp_SOURCE_DIR}) - endif() endif() # abseil_cpp_SOURCE_DIR is non-empty if we build it from source diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index a2375b959752d..8563aa24e2f33 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -224,11 +224,6 @@ onnxruntime_fetchcontent_makeavailable(Protobuf) if(Protobuf_FOUND) message(STATUS "Using 
protobuf from find_package(or vcpkg). Protobuf version: ${Protobuf_VERSION}") else() - if(protobuf_SOURCE_DIR) - if(onnxruntime_USE_WEBGPU) - set(DAWN_PROTOBUF_DIR ${protobuf_SOURCE_DIR}) - endif() - endif() # Adjust warning flags if (TARGET libprotoc) if (NOT MSVC) @@ -645,19 +640,28 @@ if (onnxruntime_USE_WEBGPU) set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE) set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE) set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE) + set(DAWN_BUILD_PROTOBUF OFF CACHE BOOL "" FORCE) set(DAWN_BUILD_TESTS OFF CACHE BOOL "" FORCE) if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") - if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) - set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE BOOL "" FORCE) + if (onnxruntime_BUILD_DAWN_SHARED_LIBRARY) + set(DAWN_BUILD_MONOLITHIC_LIBRARY SHARED CACHE BOOL "" FORCE) set(DAWN_ENABLE_INSTALL ON CACHE BOOL "" FORCE) if (onnxruntime_USE_EXTERNAL_DAWN) - message(FATAL_ERROR "onnxruntime_USE_EXTERNAL_DAWN and onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY cannot be enabled at the same time.") + message(FATAL_ERROR "onnxruntime_USE_EXTERNAL_DAWN and onnxruntime_BUILD_DAWN_SHARED_LIBRARY cannot be enabled at the same time.") endif() else() # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE) set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) + + # use the same protobuf/abseil for ORT and Dawn when static linking + if(abseil_cpp_SOURCE_DIR) + set(DAWN_ABSEIL_DIR ${abseil_cpp_SOURCE_DIR}) + endif() + if(protobuf_SOURCE_DIR) + set(DAWN_PROTOBUF_DIR ${protobuf_SOURCE_DIR}) + endif() endif() if (onnxruntime_ENABLE_PIX_FOR_WEBGPU_EP) @@ -714,6 +718,7 @@ if (onnxruntime_USE_WEBGPU) set(DAWN_ENABLE_D3D11 OFF CACHE BOOL "" FORCE) endif() endif() + if (onnxruntime_CUSTOM_DAWN_SRC_PATH) # use the custom dawn source path if provided # @@ -766,7 +771,7 @@ if (onnxruntime_USE_WEBGPU) endif() if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") - if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + if (onnxruntime_BUILD_DAWN_SHARED_LIBRARY) list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::webgpu_dawn) else() if (NOT onnxruntime_USE_EXTERNAL_DAWN) diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake index a65bd9373d1b7..c81bd9d41e4e0 100644 --- a/cmake/onnxruntime_java.cmake +++ b/cmake/onnxruntime_java.cmake @@ -197,7 +197,7 @@ if (WIN32) ) endif() endif() - if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + if (onnxruntime_BUILD_DAWN_SHARED_LIBRARY) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() endif() @@ -223,7 +223,7 @@ else() if (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() - if (onnxruntime_USE_WEBGPU AND onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + if (onnxruntime_USE_WEBGPU AND onnxruntime_BUILD_DAWN_SHARED_LIBRARY) add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${JAVA_PACKAGE_LIB_DIR}/$) endif() endif() diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake index 146a00c1b98e2..b28bda6c94276 100644 --- a/cmake/onnxruntime_nodejs.cmake +++ b/cmake/onnxruntime_nodejs.cmake @@ -65,7 +65,7 @@ if (onnxruntime_USE_WEBGPU) list(APPEND NODEJS_DLL_DEPS "$/dxcompiler.dll") endif() endif() - if 
(onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + if (onnxruntime_BUILD_DAWN_SHARED_LIBRARY) list(APPEND NODEJS_DLL_DEPS "$") endif() endif() diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake index 5e092f4e4e57c..b762b149d9d6f 100644 --- a/cmake/onnxruntime_providers_webgpu.cmake +++ b/cmake/onnxruntime_providers_webgpu.cmake @@ -63,7 +63,7 @@ set(onnxruntime_providers_webgpu_dll_deps) - if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + if (onnxruntime_BUILD_DAWN_SHARED_LIBRARY) target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn) if (WIN32) diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index c5c85dff96411..617a1e057c388 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -1100,7 +1100,7 @@ if (onnxruntime_USE_WEBGPU) ) endif() endif() - if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + if (onnxruntime_BUILD_DAWN_SHARED_LIBRARY) add_custom_command( TARGET onnxruntime_pybind11_state POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy diff --git a/onnxruntime/core/dll/delay_load_hook.cc b/onnxruntime/core/dll/delay_load_hook.cc index 3e7b9c132456d..faef37881b7c3 100644 --- a/onnxruntime/core/dll/delay_load_hook.cc +++ b/onnxruntime/core/dll/delay_load_hook.cc @@ -21,10 +21,10 @@ // - https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#alternate-search-order-for-unpackaged-apps // // The DLL DelayLoad hook is only enabled when the compiler is MSVC and at least one of the following is True: -// - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined +// - both USE_WEBGPU and BUILD_DAWN_SHARED_LIBRARY are defined // - USE_DML is defined // -#if defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY) +#if defined(USE_WEBGPU) && defined(BUILD_DAWN_SHARED_LIBRARY) #define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 1 #else #define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 0 diff --git a/onnxruntime/core/providers/webgpu/program_manager.cc b/onnxruntime/core/providers/webgpu/program_manager.cc index 7a4a873a1adf3..dcf89d8bb06a1 100644 --- a/onnxruntime/core/providers/webgpu/program_manager.cc +++ b/onnxruntime/core/providers/webgpu/program_manager.cc @@ -79,11 +79,11 @@ Status ProgramManager::Build(const ProgramBase& program, #endif << "] End ===\n"; - wgpu::ShaderModuleWGSLDescriptor wgsl_descriptor{}; - wgsl_descriptor.code = code.c_str(); + wgpu::ShaderSourceWGSL wgsl_source{}; + wgsl_source.code = code.c_str(); wgpu::ShaderModuleDescriptor descriptor{}; - descriptor.nextInChain = &wgsl_descriptor; + descriptor.nextInChain = &wgsl_source; auto shader_module = device_.CreateShaderModule(&descriptor); diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index a9557f7b9aa87..731b8e678c58a 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -10,7 +10,7 @@ #endif #if !defined(__wasm__) -#if !defined(BUILD_DAWN_MONOLITHIC_LIBRARY) +#if !defined(BUILD_DAWN_SHARED_LIBRARY) #include "dawn/dawn_proc.h" #endif #if !defined(USE_EXTERNAL_DAWN) @@ -860,7 +860,7 @@ WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& co #if !defined(__wasm__) const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); -#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) +#if defined(BUILD_DAWN_SHARED_LIBRARY) ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); #else 
#if !defined(USE_EXTERNAL_DAWN) @@ -875,8 +875,10 @@ WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& co #endif // Step.2 - Create wgpu::Instance + wgpu::InstanceFeatureName required_instance_features[] = {wgpu::InstanceFeatureName::TimedWaitAny}; wgpu::InstanceDescriptor instance_desc{}; - instance_desc.capabilities.timedWaitAnyEnable = true; + instance_desc.requiredFeatures = required_instance_features; + instance_desc.requiredFeatureCount = sizeof(required_instance_features) / sizeof(required_instance_features[0]); default_instance_ = wgpu::CreateInstance(&instance_desc); ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance."); diff --git a/onnxruntime/wasm/wasm_post_build.js b/onnxruntime/wasm/wasm_post_build.js index 64bece0da4cbb..f024026a2a3ee 100644 --- a/onnxruntime/wasm/wasm_post_build.js +++ b/onnxruntime/wasm/wasm_post_build.js @@ -55,21 +55,21 @@ if (path.basename(mjsFilePath).includes('-threaded')) { // // (for debug build) // -// > subgroups: "17", +// > subgroups: "18", // --- change to --> -// > "subgroups": "17", +// > "subgroups": "18", // // (for release build) // -// > Pe:"17", +// > Pe:"18", // --- change to --> -// > "subgroups":"17", +// > "subgroups":"18", // // This step should only be applied for WebGPU EP builds if (path.basename(mjsFilePath).includes('.async')) { - const regexDebug = 'subgroups: "17"'; - const regexRelease = '[a-zA-Z_$][a-zA-Z0-9_$]*:"17"'; + const regexDebug = 'subgroups: "18"'; + const regexRelease = '[a-zA-Z_$][a-zA-Z0-9_$]*:"18"'; const matchesDebug = [...contents.matchAll(new RegExp(regexDebug, 'g'))]; const matchesRelease = [...contents.matchAll(new RegExp(regexRelease, 'g'))]; @@ -77,12 +77,12 @@ if (path.basename(mjsFilePath).includes('.async')) { if (matchesDebug.length === 1 && matchesRelease.length === 0) { contents = contents.replace( new RegExp(regexDebug), - '"subgroups": "17"', + '"subgroups": "18"', ); } else if (matchesDebug.length === 0 && matchesRelease.length === 1) { contents = contents.replace( new RegExp(regexRelease), - '"subgroups":"17"', + '"subgroups":"18"', ); } else { throw new Error( diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index 6d908dec05ee2..095a53b2e44b9 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -38,7 +38,7 @@ jobs: ORT_CACHE_DIR: $(Pipeline.Workspace)/ccache_ort pool: name: "Azure Pipelines" - image: 'macOS-14' + image: 'macOS-15' os: macOS timeoutInMinutes: 300 steps: @@ -64,6 +64,8 @@ jobs: - template: set-version-number-variables-step.yml - template: use-xcode-version.yml + parameters: + xcodeVersion: 16.4 - script: | set -e -x @@ -101,7 +103,7 @@ jobs: - template: mac-cpu-packaging-steps.yml parameters: MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=x86_64 BuildJava: true BuildNodejs: true WithCache: ${{ parameters.WithCache }} From 55f4c2a64489346fd862b8f006db930c283915d6 Mon Sep 17 00:00:00 2001 From: Eren Akbiyik <23424198+meakbiyik@users.noreply.github.com> Date: Wed, 6 Aug 2025 19:00:57 +0200 Subject: [PATCH 06/10] Fix the is_leaf check in 
TreeEnsemble (#25410) ### Description Fixes #24679. ### Motivation and Context The original check for a leaf node was insufficient because a branch child and a leaf child could have the same index. The bug described in issue #24679 is not a rare occasion; in fact, it is something likely to be faced in estimators with small and balanced trees. I encountered it myself in a unit test. The corrected check ensures that for a node to be considered a leaf, both of its children must be leaves and share the same index. --- .../cpu/ml/tree_ensemble_attribute.h | 4 +- .../providers/cpu/ml/tree_ensembler_test.cc | 46 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h index 3426b959a15c3..ca568e485da11 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_attribute.h @@ -389,8 +389,10 @@ struct TreeEnsembleAttributesV5 { int64_t curr_treeid = 0; for (const int64_t& tree_root : tree_roots) { size_t tree_root_size_t = onnxruntime::narrow(tree_root); + bool is_leaf = (nodes_falsenodeids[tree_root_size_t] == nodes_truenodeids[tree_root_size_t] && + nodes_falseleafs[tree_root_size_t] && nodes_trueleafs[tree_root_size_t]); transformInputOneTree(tree_root_size_t, curr_treeid, 0, - nodes_falsenodeids[tree_root_size_t] == nodes_truenodeids[tree_root_size_t], + is_leaf, membership_values_by_id, output); curr_treeid++; } diff --git a/onnxruntime/test/providers/cpu/ml/tree_ensembler_test.cc b/onnxruntime/test/providers/cpu/ml/tree_ensembler_test.cc index 2c3722b2cbfb9..32368734f3530 100644 --- a/onnxruntime/test/providers/cpu/ml/tree_ensembler_test.cc +++ b/onnxruntime/test/providers/cpu/ml/tree_ensembler_test.cc @@ -290,6 +290,52 @@ TEST(MLOpTest, TreeEnsembleLeafOnly) { test.Run(); } +TEST(MLOpTest, TreeEnsembleLeafLike) { + OpTester test("TreeEnsemble", 5, onnxruntime::kMLDomain); + int64_t n_targets = 1; + + int64_t aggregate_function = 1; // SUM + int64_t post_transform = 0; // NONE + std::vector tree_roots = {0, 2}; + std::vector nodes_modes = {0, 0, 0, 0, 0}; // BRANCH_LEQ + std::vector nodes_featureids = {0, 1, 0, 1, 2}; + std::vector nodes_splits = {2.0, 2.0, 3.0, 2.0, 1.0}; + std::vector nodes_truenodeids = {1, 0, 3, 4, 5}; + std::vector nodes_trueleafs = {0, 1, 1, 1, 1}; + std::vector nodes_falsenodeids = {2, 1, 3, 4, 6}; + std::vector nodes_falseleafs = {1, 1, 0, 0, 1}; + + std::vector leaf_targetids = {0, 0, 0, 0, 0, 0, 0}; + std::vector leaf_weights = {100.0, 0.0, 25.0, 0.5, -0.5, -5.0, -9.0}; + + auto nodes_modes_as_tensor = make_tensor(nodes_modes, "nodes_modes"); + auto nodes_splits_as_tensor = make_tensor(nodes_splits, "nodes_splits"); + auto leaf_weights_as_tensor = make_tensor(leaf_weights, "leaf_weight"); + + // add attributes + test.AddAttribute("n_targets", n_targets); + test.AddAttribute("aggregate_function", aggregate_function); + test.AddAttribute("post_transform", post_transform); + test.AddAttribute("tree_roots", tree_roots); + test.AddAttribute("nodes_modes", nodes_modes_as_tensor); + test.AddAttribute("nodes_featureids", nodes_featureids); + test.AddAttribute("nodes_splits", nodes_splits_as_tensor); + test.AddAttribute("nodes_truenodeids", nodes_truenodeids); + test.AddAttribute("nodes_trueleafs", nodes_trueleafs); + test.AddAttribute("nodes_falsenodeids", nodes_falsenodeids); + test.AddAttribute("nodes_falseleafs", nodes_falseleafs); + 
test.AddAttribute("leaf_targetids", leaf_targetids); + test.AddAttribute("leaf_weights", leaf_weights_as_tensor); + + // fill input data + std::vector X = {7.0, 7.0, 4.0}; + std::vector Y = {16.0}; + + test.AddInput("X", {1, 3}, X); + test.AddOutput("Y", {1, 1}, Y); + test.Run(); +} + TEST(MLOpTest, TreeEnsembleBigSet) { // https://github.com/microsoft/onnxruntime/issues/25400 OpTester test("TreeEnsemble", 5, onnxruntime::kMLDomain); From eff5b9a77952665c3728635339573a781d1ad12f Mon Sep 17 00:00:00 2001 From: vraspar Date: Wed, 6 Aug 2025 13:16:23 -0700 Subject: [PATCH 07/10] Remove training packages from onnxruntime-ios-packaging-pipeline (#25451) ### Description remove support for multiple package variants (`Full` and `Training`) in the Apple packaging pipeline, consolidating the codebase to only support the `Full` variant. The changes simplify the code by eliminating the `PackageVariant` enum, related logic, and configuration files for the `Training` variant. --------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- tools/ci_build/build.py | 4 -- .../apple/build_and_assemble_apple_pods.py | 14 +----- .../github/apple/c/assemble_c_pod_package.py | 20 ++------- .../c/onnxruntime-training-c.config.json | 5 --- ...training_ios_framework_build_settings.json | 39 ---------------- .../objectivec/assemble_objc_pod_package.py | 44 ++++++------------- .../onnxruntime-training-objc.config.json | 5 --- .../github/apple/package_assembly_utils.py | 14 ------ .../github/apple/test_apple_packages.py | 10 +---- ...os_simulator_framework_build_settings.json | 22 ---------- .../mac-ios-packaging-pipeline.yml | 6 --- .../azure-pipelines/post-merge-jobs.yml | 36 --------------- .../azure-pipelines/templates/c-api-cpu.yml | 1 - .../templates/react-native-ci.yml | 4 -- .../stages/mac-ios-packaging-build-stage.yml | 31 ++++--------- 15 files changed, 28 insertions(+), 227 deletions(-) delete mode 100644 tools/ci_build/github/apple/c/onnxruntime-training-c.config.json delete mode 100644 tools/ci_build/github/apple/default_training_ios_framework_build_settings.json delete mode 100644 tools/ci_build/github/apple/objectivec/onnxruntime-training-objc.config.json delete mode 100644 tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 2080f4f7941c6..bf24d3f3e5357 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1668,8 +1668,6 @@ def run_ios_tests(args, source_dir, config, cwd): dynamic_framework_dir, "--framework_info_file", framework_info_file, - "--variant", - "Full", "--skip_macos_test", ], cwd=cwd, @@ -1683,8 +1681,6 @@ def run_ios_tests(args, source_dir, config, cwd): static_framework_dir, "--framework_info_file", framework_info_file, - "--variant", - "Full", "--skip_macos_test", ], cwd=cwd, diff --git a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py index 1a8ebafdab1d5..dea2baf904231 100755 --- a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py +++ b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py @@ -14,7 +14,7 @@ import tempfile from c.assemble_c_pod_package import assemble_c_pod_package -from package_assembly_utils import PackageVariant, get_ort_version +from package_assembly_utils import get_ort_version from objectivec.assemble_objc_pod_package import assemble_objc_pod_package @@ -53,13 +53,6 @@ def parse_args(): help="The version string of the pod. 
The same version is used for all pods.", ) - parser.add_argument( - "--variant", - choices=PackageVariant.release_variant_names(), - default=PackageVariant.Full.name, - help="Pod package variant.", - ) - parser.add_argument("--test", action="store_true", help="Run tests on the framework and pod package files.") parser.add_argument( "--skip-build", @@ -103,7 +96,6 @@ def main(): staging_dir = args.staging_dir.resolve() # build framework - package_variant = PackageVariant[args.variant] framework_info_file = build_dir / "xcframework_info.json" log.info("Building Apple framework.") @@ -131,8 +123,6 @@ def main(): str(framework_info_file), "--c_framework_dir", str(build_dir / "framework_out"), - "--variant", - package_variant.name, "--test_project_stage_dir", # use a specific directory so it's easier to debug str(build_dir / "test_apple_packages_staging"), ] @@ -153,7 +143,6 @@ def main(): framework_info_file=framework_info_file, framework_dir=build_dir / "framework_out" / "onnxruntime.xcframework", public_headers_dir=build_dir / "framework_out" / "Headers", - package_variant=package_variant, ) if args.test: @@ -168,7 +157,6 @@ def main(): staging_dir=objc_pod_staging_dir, pod_version=args.pod_version, framework_info_file=framework_info_file, - package_variant=package_variant, ) if args.test: diff --git a/tools/ci_build/github/apple/c/assemble_c_pod_package.py b/tools/ci_build/github/apple/c/assemble_c_pod_package.py index 59052734ddd26..ee76ad80ba0cc 100644 --- a/tools/ci_build/github/apple/c/assemble_c_pod_package.py +++ b/tools/ci_build/github/apple/c/assemble_c_pod_package.py @@ -13,7 +13,6 @@ from package_assembly_utils import ( # noqa: E402 - PackageVariant, copy_repo_relative_to_dir, gen_file_from_template, get_podspec_values, @@ -21,16 +20,11 @@ ) -def get_pod_config_file(package_variant: PackageVariant): +def get_pod_config_file(): """ - Gets the pod configuration file path for the given package variant. + Gets the pod configuration file path. """ - if package_variant == PackageVariant.Full: - return _script_dir / "onnxruntime-c.config.json" - elif package_variant == PackageVariant.Training: - return _script_dir / "onnxruntime-training-c.config.json" - else: - raise ValueError(f"Unhandled package variant: {package_variant}") + return _script_dir / "onnxruntime-c.config.json" def assemble_c_pod_package( @@ -39,7 +33,6 @@ def assemble_c_pod_package( framework_info_file: pathlib.Path, public_headers_dir: pathlib.Path, framework_dir: pathlib.Path, - package_variant: PackageVariant, ): """ Assembles the files for the C/C++ pod package in a staging directory. @@ -49,7 +42,6 @@ def assemble_c_pod_package( :param framework_info_file Path to the framework_info.json or xcframework_info.json file containing additional values for the podspec. :param public_headers_dir Path to the public headers directory to include in the pod. :param framework_dir Path to the onnxruntime framework directory to include in the pod. - :param package_variant The pod package variant. :return Tuple of (package name, path to the podspec file). 
""" staging_dir = staging_dir.resolve() @@ -58,7 +50,7 @@ def assemble_c_pod_package( framework_dir = framework_dir.resolve(strict=True) framework_info = load_json_config(framework_info_file) - pod_config = load_json_config(get_pod_config_file(package_variant)) + pod_config = load_json_config(get_pod_config_file()) pod_name = pod_config["name"] @@ -130,9 +122,6 @@ def parse_args(): required=True, help="Path to the onnxruntime framework directory to include in the pod.", ) - parser.add_argument( - "--variant", choices=PackageVariant.all_variant_names(), required=True, help="Pod package variant." - ) return parser.parse_args() @@ -146,7 +135,6 @@ def main(): framework_info_file=args.framework_info_file, public_headers_dir=args.public_headers_dir, framework_dir=args.framework_dir, - package_variant=PackageVariant[args.variant], ) return 0 diff --git a/tools/ci_build/github/apple/c/onnxruntime-training-c.config.json b/tools/ci_build/github/apple/c/onnxruntime-training-c.config.json deleted file mode 100644 index 87011c216a50c..0000000000000 --- a/tools/ci_build/github/apple/c/onnxruntime-training-c.config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "name": "onnxruntime-training-c", - "summary": "ONNX Runtime Training C/C++ Pod", - "description": "A pod for the ONNX Runtime C/C++ library. This pod supports additional training features." -} diff --git a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json deleted file mode 100644 index bcc23296b7d3a..0000000000000 --- a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "build_osx_archs": { - "iphoneos": [ - "arm64" - ], - "iphonesimulator": [ - "arm64", - "x86_64" - ], - "macosx": [ - "arm64", - "x86_64" - ] - }, - "build_params": { - "base": [ - "--parallel", - "--use_xcode", - "--enable_training_apis", - "--build_apple_framework", - "--use_coreml", - "--use_xnnpack", - "--skip_tests", - "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" - ], - "iphoneos": [ - "--ios", - "--apple_deploy_target=15.1" - ], - "iphonesimulator": [ - "--ios", - "--apple_deploy_target=15.1" - ], - "macosx": [ - "--macos=MacOSX", - "--apple_deploy_target=13.4" - ] - } -} diff --git a/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py b/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py index a829d244ad6e0..40bf155371f98 100755 --- a/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py +++ b/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py @@ -13,7 +13,6 @@ from c.assemble_c_pod_package import get_pod_config_file as get_c_pod_config_file # noqa: E402 from package_assembly_utils import ( # noqa: E402 - PackageVariant, copy_repo_relative_to_dir, filter_files, gen_file_from_template, @@ -84,50 +83,39 @@ } -def get_pod_files(package_variant: PackageVariant): +def get_pod_files(): """ - Gets the source and header files for the given package variant. + Gets the source and header files. 
""" - if package_variant == PackageVariant.Training: - return all_objc_files - else: - # return files that are in pod_files but not in training_only_objc_files - filtered_pod_files = {} - for key, value in all_objc_files.items(): - filtered_pod_files[key] = filter_files(value, training_only_objc_files[key]) - return filtered_pod_files + # return files that are in pod_files but not in training_only_objc_files + filtered_pod_files = {} + for key, value in all_objc_files.items(): + filtered_pod_files[key] = filter_files(value, training_only_objc_files[key]) + return filtered_pod_files -def get_pod_config_file(package_variant: PackageVariant): +def get_pod_config_file(): """ - Gets the pod configuration file path for the given package variant. + Gets the pod configuration file path. """ - if package_variant == PackageVariant.Full: - return _script_dir / "onnxruntime-objc.config.json" - elif package_variant == PackageVariant.Training: - return _script_dir / "onnxruntime-training-objc.config.json" - else: - raise ValueError(f"Unhandled package variant: {package_variant}") + return _script_dir / "onnxruntime-objc.config.json" -def assemble_objc_pod_package( - staging_dir: pathlib.Path, pod_version: str, framework_info_file: pathlib.Path, package_variant: PackageVariant -): +def assemble_objc_pod_package(staging_dir: pathlib.Path, pod_version: str, framework_info_file: pathlib.Path): """ Assembles the files for the Objective-C pod package in a staging directory. :param staging_dir Path to the staging directory for the Objective-C pod files. :param pod_version Objective-C pod version. :param framework_info_file Path to the framework_info.json or xcframework_info.json file containing additional values for the podspec. - :param package_variant The pod package variant. :return Tuple of (package name, path to the podspec file). """ staging_dir = staging_dir.resolve() framework_info_file = framework_info_file.resolve(strict=True) framework_info = load_json_config(framework_info_file) - pod_config = load_json_config(get_pod_config_file(package_variant)) - c_pod_config = load_json_config(get_c_pod_config_file(package_variant)) + pod_config = load_json_config(get_pod_config_file()) + c_pod_config = load_json_config(get_c_pod_config_file()) pod_name = pod_config["name"] @@ -135,7 +123,7 @@ def assemble_objc_pod_package( if staging_dir.exists(): print("Warning: staging directory already exists", file=sys.stderr) - pod_files = get_pod_files(package_variant) + pod_files = get_pod_files() # copy the necessary files to the staging directory copy_repo_relative_to_dir( @@ -196,9 +184,6 @@ def parse_args(): help="Path to the framework_info.json or xcframework_info.json file containing additional values for the podspec. " "This file should be generated by CMake in the build directory.", ) - parser.add_argument( - "--variant", choices=PackageVariant.release_variant_names(), required=True, help="Pod package variant." 
- ) return parser.parse_args() @@ -210,7 +195,6 @@ def main(): staging_dir=args.staging_dir, pod_version=args.pod_version, framework_info_file=args.framework_info_file, - package_variant=PackageVariant[args.variant], ) return 0 diff --git a/tools/ci_build/github/apple/objectivec/onnxruntime-training-objc.config.json b/tools/ci_build/github/apple/objectivec/onnxruntime-training-objc.config.json deleted file mode 100644 index b1cc2d4aad5a8..0000000000000 --- a/tools/ci_build/github/apple/objectivec/onnxruntime-training-objc.config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "name": "onnxruntime-training-objc", - "summary": "ONNX Runtime Objective-C Pod", - "description": "A pod for the ONNX Runtime Objective-C training API." -} diff --git a/tools/ci_build/github/apple/package_assembly_utils.py b/tools/ci_build/github/apple/package_assembly_utils.py index 829bca8c743df..04eecbd6b4705 100644 --- a/tools/ci_build/github/apple/package_assembly_utils.py +++ b/tools/ci_build/github/apple/package_assembly_utils.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import enum import json import os import pathlib @@ -12,19 +11,6 @@ repo_root = _script_dir.parents[3] -class PackageVariant(enum.Enum): - Full = 0 # full ORT build with all opsets, ops, and types - Training = 1 # full ORT build with all opsets, ops, and types, plus training APIs - - @classmethod - def release_variant_names(cls): - return [v.name for v in cls if v.value >= 0] - - @classmethod - def all_variant_names(cls): - return [v.name for v in cls] - - _template_variable_pattern = re.compile(r"@(\w+)@") # match "@var@" diff --git a/tools/ci_build/github/apple/test_apple_packages.py b/tools/ci_build/github/apple/test_apple_packages.py index 14c0b46676ac6..6a935e2fc167d 100644 --- a/tools/ci_build/github/apple/test_apple_packages.py +++ b/tools/ci_build/github/apple/test_apple_packages.py @@ -13,7 +13,7 @@ import tempfile from c.assemble_c_pod_package import assemble_c_pod_package -from package_assembly_utils import PackageVariant, gen_file_from_template, get_ort_version +from package_assembly_utils import gen_file_from_template, get_ort_version SCRIPT_PATH = pathlib.Path(__file__).resolve(strict=True) REPO_DIR = SCRIPT_PATH.parents[4] @@ -81,7 +81,6 @@ def _test_apple_packages(args): framework_info_file=args.framework_info_file, public_headers_dir=public_headers_dir, framework_dir=framework_dir, - package_variant=PackageVariant[args.variant], ) # move podspec out to target_proj_path first @@ -239,13 +238,6 @@ def parse_args(): "--c_framework_dir", type=pathlib.Path, required=True, help="Provide the parent directory for C/C++ framework" ) - parser.add_argument( - "--variant", - choices=PackageVariant.all_variant_names(), - required=True, - help="Pod package variant.", - ) - parser.add_argument( "--test_project_stage_dir", type=pathlib.Path, diff --git a/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json b/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json deleted file mode 100644 index 43516d0bd659c..0000000000000 --- a/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "build_osx_archs": { - "iphonesimulator": [ - "x86_64" - ] - }, - "build_params": { - "base": [ - "--parallel", - "--use_xcode", - "--build_apple_framework", - "--minimal_build=extended", - "--enable_training_apis", - "--skip_tests", - 
"--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" - ], - "iphonesimulator": [ - "--ios", - "--apple_deploy_target=15.1" - ] - } -} diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index 70d8e954808a5..d446f6264ca8c 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -66,10 +66,4 @@ extends: - template: templates/stages/mac-ios-packaging-build-stage.yml parameters: - packageVariant: Full - buildType: ${{ parameters.buildType }} - - - template: templates/stages/mac-ios-packaging-build-stage.yml - parameters: - packageVariant: Training buildType: ${{ parameters.buildType }} diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 8647b32962165..197edf7bcad24 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -447,41 +447,5 @@ stages: python tools/ci_build/github/apple/test_apple_packages.py \ --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ - --variant Full \ - --skip_macos_test - displayName: "Test pod with iOS framework" - -- stage: IosMinimalTrainingBuild - dependsOn: [] - jobs: - - job: IosMinimalTrainingBuild - timeoutInMinutes: 120 - pool: - vmImage: "macOS-14" - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: "3.12" - addToPath: true - architecture: "x64" - - - template: templates/use-xcode-version.yml - - - script: | - pip install -r tools/ci_build/github/apple/ios_packaging/requirements.txt - displayName: "Install Python requirements" - - - script: | - python tools/ci_build/github/apple/build_apple_framework.py \ - --build_dir "$(Build.BinariesDirectory)/ios_framework" \ - tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json - displayName: "Build iOS framework with minimal build and training enabled" - - - script: | - python tools/ci_build/github/apple/test_apple_packages.py \ - --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ - --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ - --variant Training \ --skip_macos_test displayName: "Test pod with iOS framework" diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index bf65b0c54cf27..becd3a44841b6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -134,7 +134,6 @@ stages: --fail_if_cocoapods_missing \ --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ - --variant Full \ --skip_macos_test \ --mac_catalyst_enabled displayName: "Test Apple framework" diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index fe26dc20106f7..47632dd3e0b1f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -103,7 +103,6 @@ stages: python 
$(Build.SourcesDirectory)/tools/ci_build/github/apple/build_and_assemble_apple_pods.py \ --build-dir "$(Build.BinariesDirectory)/ios_framework_full" \ --staging-dir "$(Build.BinariesDirectory)/ios-full-pod" \ - --variant Full \ --build-settings-file $(Build.SourcesDirectory)/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json displayName: Build iOS package and assemble pods env: @@ -226,6 +225,3 @@ stages: displayName: Publish React Native Detox E2E test logs - template: explicitly-defined-final-tasks.yml - - - diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index 5eef1ae8e8e93..c7fced0c50cf5 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -1,10 +1,4 @@ parameters: -- name: packageVariant - type: string - values: - - Full - - Training - - name: buildType type: string values: @@ -13,11 +7,11 @@ parameters: default: normal stages: -- stage: IosPackaging_Build_${{ parameters.packageVariant }} +- stage: IosPackaging_Build dependsOn: [] jobs: - job: - displayName: "Build iOS package for variant: ${{ parameters.packageVariant}}" + displayName: "Build iOS package" variables: # Note: Keep the Xcode version and iOS simulator version compatible. @@ -25,21 +19,15 @@ stages: # https://developer.apple.com/support/xcode/ xcodeVersion: "15.3.0" iosSimulatorRuntimeVersion: "17.4" - ${{ if eq(parameters.packageVariant, 'Full') }}: - buildSettingsFile: "tools/ci_build/github/apple/default_full_apple_framework_build_settings.json" - cPodName: onnxruntime-c - objcPodName: onnxruntime-objc - - ${{ if eq(parameters.packageVariant, 'Training') }}: - buildSettingsFile: "tools/ci_build/github/apple/default_training_ios_framework_build_settings.json" - cPodName: onnxruntime-training-c - objcPodName: onnxruntime-training-objc + buildSettingsFile: "tools/ci_build/github/apple/default_full_apple_framework_build_settings.json" + cPodName: onnxruntime-c + objcPodName: onnxruntime-objc timeoutInMinutes: 270 templateContext: outputs: - output: pipelineArtifact targetPath: $(Build.ArtifactStagingDirectory) - artifactName: ios_packaging_artifacts_${{ lower(parameters.packageVariant) }} + artifactName: ios_packaging_artifacts_full steps: - bash: | @@ -108,9 +96,7 @@ stages: --staging-dir "$(Build.BinariesDirectory)/staging" \ --pod-version "$(ortPodVersion)" \ --test \ - --variant ${{ parameters.packageVariant }} \ - --build-settings-file "${{ variables.buildSettingsFile }}" \ - ${{ variables.optionalIncludeOpsByConfigOption }} + --build-settings-file "${{ variables.buildSettingsFile }}" displayName: "Build macOS/iOS framework and assemble pod package files" env: ORT_GET_SIMULATOR_DEVICE_INFO_REQUESTED_RUNTIME_VERSION: $(iosSimulatorRuntimeVersion) @@ -120,7 +106,6 @@ stages: --fail_if_cocoapods_missing \ --framework_info_file "$(Build.BinariesDirectory)/apple_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/apple_framework/framework_out" \ - --variant ${{ parameters.packageVariant }} \ --test_project_stage_dir "$(Build.BinariesDirectory)/app_center_test" \ --prepare_test_project_only displayName: "Assemble test project for App Center" @@ -190,7 +175,7 @@ stages: - task: 1ES.PublishPipelineArtifact@1 inputs: path: $(Build.ArtifactStagingDirectory) - artifact: 
browserstack_test_artifacts_${{ lower(parameters.packageVariant) }} + artifact: browserstack_test_artifacts_full displayName: "Publish BrowserStack artifacts" - script: | From ee794afec24bd448eabf15102d1f14a110c35516 Mon Sep 17 00:00:00 2001 From: Caroline Zhu Date: Wed, 6 Aug 2025 14:13:38 -0700 Subject: [PATCH 08/10] Enable BrowserStack testing stage (#25668) ### Description Tests previously not running in ADO pipelines correctly -- instead would time out --- .../browserstack.yml | 4 ++-- .../azure-pipelines/nuget/templates/test_android.yml | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml index 9efbc9fc6aa61..0afa8ca9c7fae 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml @@ -3,11 +3,11 @@ platforms: - platformName: android deviceName: Samsung Galaxy S22 Ultra platformVersion: 12.0 -browserstackLocal: true +browserstackLocal: false buildName: ORT android test buildIdentifier: ${BUILD_NUMBER} projectName: ORT-UITests debug: true networkLogs: false testContextOptions: - skipSessionStatus: true \ No newline at end of file + skipSessionStatus: true diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_android.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_android.yml index e75804f0b35cb..546d7e1c2e038 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_android.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_android.yml @@ -49,6 +49,15 @@ stages: dotnet publish -c Release --property:UsePrebuiltNativePackage=true --property:CurrentOnnxRuntimeVersion=$(NuGetPackageVersionNumber) -f net8.0-android workingDirectory: '$(Build.SourcesDirectory)\csharp\test\Microsoft.ML.OnnxRuntime.Tests.MAUI' + - task: BrowserStackConfig@0 + inputs: + BrowserStackServiceEndPoint: 'OnnxRuntimeBrowserStackConnection' + + - task: BrowserStackAppUploader@0 + inputs: + appPath: '$(Build.SourcesDirectory)\csharp\test\Microsoft.ML.OnnxRuntime.Tests.MAUI\bin\Release\net8.0-android\publish\ORT.CSharp.Tests.MAUI-Signed.apk' + appCustomId: 'Appium-Android-CI' + - task: PowerShell@2 displayName: Run BrowserStack test inputs: From e0786fe7bbab388be360daaa1b4937e75be4fa31 Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:30:18 -0700 Subject: [PATCH 09/10] ORT perf test support for plugin EP (#25374) ### Description Add support for onnxruntime_perf_test to register plugin EP dll and run plugin EP. As support for plugin execution providers (EPs) requires additional options and most single-character options have already been used, multi-character options are now necessary to ensure clarity and readability. Therefore, support for `Abseil flags` is added, which enables multi-character options and provides cross-platform compatibility. **New options:** - `--plugin_ep_libs [registration names and libraries]` Specifies a list of plugin execution provider (EP) registration names and their corresponding shared libraries to register. [Usage]: `--plugin_ep_libs "plugin_ep_name_1|plugin_ep_1.dll plugin_ep_name_2|plugin_ep_2.dll ... "` - `--plugin_eps [Plugin EPs]` Specifies a semicolon-separated list of plugin execution providers (EPs) to use. 
[Usage]: `--plugin_eps "plugin_ep_1;plugin_ep_2;... "`

- `--plugin_ep_options [EP options]`
Specifies provider options for each EP listed in --plugin_eps. Options (key-value pairs) for each EP are separated by spaces, and EPs are separated by semicolons.
[Usage]: `--plugin_ep_options "ep_1_option_1_key|ep_1_option_1_value ...;ep_2_option_1_key|ep_2_option_1_value ...;..."` or
`--plugin_ep_options ";ep_2_option_1_key|ep_2_option_1_value ...;..."` or
`--plugin_ep_options "ep_1_option_1_key|ep_1_option_1_value ...;;ep_3_option_1_key|ep_3_option_1_value ...;..."`

- `--list_ep_devices`
Prints all available device indices and their properties (including metadata). This option makes the program exit early without performing inference.

- `--select_ep_devices [list of device indices]`
A semicolon-separated list of device indices to add to the session and run with.

**Usage:**

1. Use `--plugin_ep_libs` and `--list_ep_devices` to list all the devices.
````sh
--list_ep_devices --plugin_ep_libs "TensorRTEp|C:\TensorRTEp.dll example_ep|C:\example_plugin_ep.dll"
````
This prints the device info:
````
===== EP device id 0 ======
name: CPUExecutionProvider
vendor: Microsoft
metadata:
version: 1.23.0

===== EP device id 1 ======
name: example_ep
vendor: Contoso
metadata:
supported_devices: CrackGriffin 7+
version: 0.1.0

===== EP device id 2 ======
name: TensorRTEp
vendor: Nvidia
metadata:
gpu_type: data center
version: 0.1.0
````
2. Use `--select_ep_devices` to select a device by index, and add `--plugin_eps` to specify the EP name. The EP name should match the name the EP library passes in when creating the EP factory.
````sh
--plugin_ep_libs "TensorRTEp|C:\TensorRTEp.dll" --select_ep_devices 2 --plugin_eps TensorRTEp -r 1 C:\mul_op\mul_1.onnx
````
3. Or simply use `-e` to specify the EP name; ORT perf test will then add all the devices created by the plugin EP. The EP name should match the name the EP library passes in when creating the EP factory. The command for this case follows the note below.
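To make the `name|library` convention above concrete, here is a minimal, illustrative C++ sketch of how a `--plugin_ep_libs` value could be split into registration-name/library pairs. It is only a sketch: the helper name `ParsePluginEpLibs` is hypothetical, and this is not the parsing code added in `strings_helper.cc` by this patch.

````cpp
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// Illustrative sketch only (not the strings_helper.cc implementation):
// split a --plugin_ep_libs value such as
//   "plugin_ep_name_1|plugin_ep_1.dll plugin_ep_name_2|plugin_ep_2.dll"
// into (registration name, library path) pairs. Entries are separated by
// whitespace and each entry uses '|' between the name and the library.
std::vector<std::pair<std::string, std::string>> ParsePluginEpLibs(const std::string& value) {
  std::vector<std::pair<std::string, std::string>> pairs;
  std::istringstream entries(value);
  std::string entry;
  while (entries >> entry) {
    const size_t sep = entry.find('|');
    if (sep == std::string::npos || sep == 0 || sep + 1 == entry.size()) {
      continue;  // malformed entry; a real implementation would likely report an error
    }
    pairs.emplace_back(entry.substr(0, sep), entry.substr(sep + 1));
  }
  return pairs;
}
````

The same `|` convention is reused by `--plugin_ep_options`, with `;` separating the option groups of the EPs listed in `--plugin_eps`.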
````sh --plugin_ep_libs "TensorRTEp|C:\TensorRTEp.dll" --plugin_eps TensorRTEp -r 1 C:\mul_op\mul_1.onnx ```` --- cmake/onnxruntime_unittests.cmake | 4 +- .../test/perftest/command_args_parser.cc | 833 ++++++++++-------- .../test/perftest/command_args_parser.h | 1 - onnxruntime/test/perftest/common_utils.cc | 95 ++ onnxruntime/test/perftest/main.cc | 28 +- onnxruntime/test/perftest/ort_test_session.cc | 78 ++ onnxruntime/test/perftest/strings_helper.cc | 37 + onnxruntime/test/perftest/strings_helper.h | 7 + .../test/perftest/test_configuration.h | 9 +- onnxruntime/test/perftest/utils.h | 13 +- 10 files changed, 722 insertions(+), 383 deletions(-) create mode 100644 onnxruntime/test/perftest/common_utils.cc diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index c3bebba3bab54..ef2f7244698c8 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1258,7 +1258,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) onnx_test_runner_common onnxruntime_test_utils onnxruntime_common onnxruntime onnxruntime_flatbuffers onnx_test_data_proto ${onnxruntime_EXTERNAL_LIBRARIES} - ${GETOPT_LIB_WIDE} ${SYS_PATH_LIB} ${CMAKE_DL_LIBS}) + absl::flags absl::flags_parse ${SYS_PATH_LIB} ${CMAKE_DL_LIBS}) if(NOT WIN32) if(onnxruntime_USE_SNPE) list(APPEND onnxruntime_perf_test_libs onnxruntime_providers_snpe) @@ -1278,7 +1278,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32) endif() else() - target_link_libraries(onnxruntime_perf_test PRIVATE onnx_test_runner_common ${GETOPT_LIB_WIDE} ${onnx_test_libs}) + target_link_libraries(onnxruntime_perf_test PRIVATE onnx_test_runner_common absl::flags absl::flags_parse ${onnx_test_libs}) endif() set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest") diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 843875a881f0a..5c81696d5c57e 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -4,6 +4,7 @@ // Licensed under the MIT License. #include "command_args_parser.h" +#include "utils.h" #include #include @@ -11,14 +12,6 @@ #include #include -// Windows Specific -#ifdef _WIN32 -#include "getopt.h" -#include "windows.h" -#else -#include -#endif - #include #include #include @@ -26,161 +19,163 @@ #include "test_configuration.h" #include "strings_helper.h" +#include "absl/flags/flag.h" +#include "absl/flags/parse.h" +#include "absl/flags/usage.h" +#include "absl/flags/usage_config.h" +#include "absl/flags/reflection.h" + +static const onnxruntime::perftest::PerformanceTestConfig& DefaultPerformanceTestConfig() { + static onnxruntime::perftest::PerformanceTestConfig default_config{}; + return default_config; +} + +ABSL_FLAG(std::string, f, "", "Specifies a free dimension by name to override to a specific value for performance optimization."); +ABSL_FLAG(std::string, F, "", "Specifies a free dimension by denotation to override to a specific value for performance optimization."); +ABSL_FLAG(std::string, m, "duration", "Specifies the test mode. 
Value could be 'duration' or 'times'."); +ABSL_FLAG(std::string, e, "cpu", "Specifies the provider 'cpu','cuda','dnnl','tensorrt', 'nvtensorrtrtx', 'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'qnn', 'snpe', 'rocm', 'migraphx', 'xnnpack', 'vitisai' or 'webgpu'."); +ABSL_FLAG(size_t, r, DefaultPerformanceTestConfig().run_config.repeated_times, "Specifies the repeated times if running in 'times' test mode."); +ABSL_FLAG(size_t, t, DefaultPerformanceTestConfig().run_config.duration_in_seconds, "Specifies the seconds to run for 'duration' mode."); +ABSL_FLAG(std::string, p, "", "Specifies the profile name to enable profiling and dump the profile data to the file."); +ABSL_FLAG(int, x, DefaultPerformanceTestConfig().run_config.intra_op_num_threads, "Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0."); +ABSL_FLAG(int, y, DefaultPerformanceTestConfig().run_config.inter_op_num_threads, "Sets the number of threads used to parallelize the execution of the graph (across nodes), A value of 0 means ORT will pick a default. Must >=0."); +ABSL_FLAG(size_t, c, DefaultPerformanceTestConfig().run_config.concurrent_session_runs, "Specifies the (max) number of runs to invoke simultaneously."); +ABSL_FLAG(int, d, DefaultPerformanceTestConfig().run_config.cudnn_conv_algo, "Specifies CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default)."); +ABSL_FLAG(int, o, DefaultPerformanceTestConfig().run_config.optimization_level, "Specifies graph optimization level. Default is 99 (all). Valid values are 0 (disable), 1 (basic), 2 (extended), 3 (layout), 99 (all)."); +ABSL_FLAG(std::string, u, "", "Specifies the optimized model path for saving."); +ABSL_FLAG(std::string, i, "", + "Specifies EP specific runtime options as key-value pairs.\n Different runtime options available are: \n" + " [Usage]: -e -i '| |'\n" + "\n" + " [ACL only] [enable_fast_math]: Options: 'true', 'false', default: 'false', \n" + "\n" + " [DML only] [performance_preference]: DML device performance preference, options: 'default', 'minimum_power', 'high_performance', \n" + " [DML only] [device_filter]: DML device filter, options: 'any', 'gpu', 'npu', \n" + " [DML only] [disable_metacommands]: Options: 'true', 'false', \n" + " [DML only] [enable_graph_capture]: Options: 'true', 'false', \n" + " [DML only] [enable_graph_serialization]: Options: 'true', 'false', \n" + "\n" + " [OpenVINO only] [device_type]: Overrides the accelerator hardware type and precision with these values at runtime.\n" + " [OpenVINO only] [device_id]: Selects a particular hardware device for inference.\n" + " [OpenVINO only] [num_of_threads]: Overrides the accelerator hardware type and precision with these values at runtime.\n" + " [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n" + " [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n" + " [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" + "\n" + " [QNN only] [backend_type]: QNN backend type. E.g., 'cpu', 'htp'. Mutually exclusive with 'backend_path'.\n" + " [QNN only] [backend_path]: QNN backend path. E.g., '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. 
Mutually exclusive with 'backend_type'.\n" + " [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n" + " [QNN only] [profiling_file_path] : QNN profiling file path if ETW not enabled.\n" + " [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" + " [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n" + " [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n" + " 'high_power_saver', 'low_balanced', 'extreme_power_saver', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n" + " [QNN only] [op_packages]: QNN UDO package, allowed format: \n" + " op_packages|::[:],::[:]. \n" + " [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. \n" + " [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n" + " [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n" + " '0', '1', '2', '3', default is '0'.\n" + " [QNN only] [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n" + " [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. \n" + " Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). \n" + " [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n" + " [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n" + " Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n" + " [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n" + " Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n" + " [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill fill buffer, used while generating QNN context binary.\n" + " [QNN only] [enable_htp_shared_memory_allocator]: Enable the QNN HTP shared memory allocator and use it for inputs and outputs. 
Requires libcdsprpc.so/dll to be available.\n" + " Defaults to '0' (disabled).\n" + " [Example] [For QNN EP] -e qnn -i \"backend_type|cpu\" \n" + "\n" + " [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n" + " [TensorRT only] [trt_min_subgraph_size]: Minimum size of TensorRT subgraphs.\n" + " [TensorRT only] [trt_max_workspace_size]: Set TensorRT maximum workspace size in byte.\n" + " [TensorRT only] [trt_fp16_enable]: Enable TensorRT FP16 precision.\n" + " [TensorRT only] [trt_int8_enable]: Enable TensorRT INT8 precision.\n" + " [TensorRT only] [trt_int8_calibration_table_name]: Specify INT8 calibration table name.\n" + " [TensorRT only] [trt_int8_use_native_calibration_table]: Use Native TensorRT calibration table.\n" + " [TensorRT only] [trt_dla_enable]: Enable DLA in Jetson device.\n" + " [TensorRT only] [trt_dla_core]: DLA core number.\n" + " [TensorRT only] [trt_dump_subgraphs]: Dump TRT subgraph to onnx model.\n" + " [TensorRT only] [trt_engine_cache_enable]: Enable engine caching.\n" + " [TensorRT only] [trt_engine_cache_path]: Specify engine cache path.\n" + " [TensorRT only] [trt_engine_cache_prefix]: Customize engine cache prefix when trt_engine_cache_enable is true.\n" + " [TensorRT only] [trt_engine_hw_compatible]: Enable hardware compatibility. Engines ending with '_sm80+' can be re-used across all Ampere+ GPU (a hardware-compatible engine may have lower throughput and/or higher latency than its non-hardware-compatible counterpart).\n" + " [TensorRT only] [trt_weight_stripped_engine_enable]: Enable weight-stripped engine build.\n" + " [TensorRT only] [trt_onnx_model_folder_path]: Folder path for the ONNX model with weights.\n" + " [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n" + " [TensorRT only] [trt_context_memory_sharing_enable]: Enable TensorRT context memory sharing between subgraphs.\n" + " [TensorRT only] [trt_layer_norm_fp32_fallback]: Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow.\n" + " [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n" + "\n" + " [NNAPI only] [NNAPI_FLAG_USE_FP16]: Use fp16 relaxation in NNAPI EP..\n" + " [NNAPI only] [NNAPI_FLAG_USE_NCHW]: Use the NCHW layout in NNAPI EP.\n" + " [NNAPI only] [NNAPI_FLAG_CPU_DISABLED]: Prevent NNAPI from using CPU devices.\n" + " [NNAPI only] [NNAPI_FLAG_CPU_ONLY]: Using CPU only in NNAPI EP.\n" + " [Example] [For NNAPI EP] -e nnapi -i \"NNAPI_FLAG_USE_FP16 NNAPI_FLAG_USE_NCHW NNAPI_FLAG_CPU_DISABLED\"\n" + "\n" + " [CoreML only] [ModelFormat]:[MLProgram, NeuralNetwork] Create an ML Program model or Neural Network. 
Default is NeuralNetwork.\n" + " [CoreML only] [MLComputeUnits]:[CPUAndNeuralEngine CPUAndGPU ALL CPUOnly] Specify to limit the backend device used to run the model.\n" + " [CoreML only] [AllowStaticInputShapes]:[0 1].\n" + " [CoreML only] [EnableOnSubgraphs]:[0 1].\n" + " [CoreML only] [SpecializationStrategy]:[Default FastPrediction].\n" + " [CoreML only] [ProfileComputePlan]:[0 1].\n" + " [CoreML only] [AllowLowPrecisionAccumulationOnGPU]:[0 1].\n" + " [CoreML only] [ModelCacheDirectory]:[path../a/b/c].\n" + " [Example] [For CoreML EP] -e coreml -i \"ModelFormat|MLProgram MLComputeUnits|CPUAndGPU\"\n" + "\n" + " [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" + " [SNPE only] [priority]: execution priority, options: 'low', 'normal'. \n" + " [SNPE only] [buffer_type]: options: 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. default: ITENSOR'. \n" + " [SNPE only] [enable_init_cache]: enable SNPE init caching feature, set to 1 to enabled it. Disabled by default. \n" + " [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n"); +ABSL_FLAG(int, S, DefaultPerformanceTestConfig().run_config.random_seed_for_input_data, "Given random seed, to produce the same input data. This defaults to -1(no initialize)."); +ABSL_FLAG(std::string, T, "", "Specifies intra op thread affinity string."); +ABSL_FLAG(std::string, C, "", + "Specifies session configuration entries as key-value pairs:\n -C \"| |\" \n" + "Refer to onnxruntime_session_options_config_keys.h for valid keys and values. \n" + "[Example] -C \"session.disable_cpu_ep_fallback|1 ep.context_enable|1\" \n"); +ABSL_FLAG(std::string, R, "", "Allows user to register custom op by .so or .dll file."); +ABSL_FLAG(bool, A, DefaultPerformanceTestConfig().run_config.enable_cpu_mem_arena, "Disables memory arena."); +ABSL_FLAG(bool, M, DefaultPerformanceTestConfig().run_config.enable_memory_pattern, "Disables memory pattern."); +ABSL_FLAG(bool, s, DefaultPerformanceTestConfig().run_config.f_dump_statistics, "Shows statistics result, like P75, P90. If no result_file provided this defaults to on."); +ABSL_FLAG(bool, v, DefaultPerformanceTestConfig().run_config.f_verbose, "Shows verbose information."); +ABSL_FLAG(bool, I, DefaultPerformanceTestConfig().run_config.generate_model_input_binding, "Generates tensor input binding. Free dimensions are treated as 1 unless overridden using -f."); +ABSL_FLAG(bool, P, false, "Uses parallel executor instead of sequential executor."); +ABSL_FLAG(bool, q, DefaultPerformanceTestConfig().run_config.do_cuda_copy_in_separate_stream, "[CUDA only] Uses separate stream for copy."); +ABSL_FLAG(bool, z, DefaultPerformanceTestConfig().run_config.set_denormal_as_zero, "Sets denormal as zero. 
When turning on this option reduces latency dramatically, a model may have denormals."); +ABSL_FLAG(bool, D, DefaultPerformanceTestConfig().run_config.disable_spinning, "Disables spinning entirely for thread owned by onnxruntime intra-op thread pool."); +ABSL_FLAG(bool, Z, DefaultPerformanceTestConfig().run_config.disable_spinning_between_run, "Disallows thread from spinning during runs to reduce cpu usage."); +ABSL_FLAG(bool, n, DefaultPerformanceTestConfig().run_config.exit_after_session_creation, "Allows user to measure session creation time to measure impact of enabling any initialization optimizations."); +ABSL_FLAG(bool, l, DefaultPerformanceTestConfig().model_info.load_via_path, "Provides file as binary in memory by using fopen before session creation."); +ABSL_FLAG(bool, g, DefaultPerformanceTestConfig().run_config.enable_cuda_io_binding, "[TensorRT RTX | TensorRT | CUDA] Enables tensor input and output bindings on CUDA before session run."); +ABSL_FLAG(bool, X, DefaultPerformanceTestConfig().run_config.use_extensions, "Registers custom ops from onnxruntime-extensions."); +ABSL_FLAG(std::string, plugin_ep_libs, "", + "Specifies a list of plugin execution provider (EP) registration names and their corresponding shared libraries to register.\n" + "[Usage]: --plugin_ep_libs \"plugin_ep_name_1|plugin_ep_1.dll plugin_ep_name_2|plugin_ep_2.dll ... \""); +ABSL_FLAG(std::string, plugin_eps, "", "Specifies a semicolon-separated list of plugin execution providers (EPs) to use."); +ABSL_FLAG(std::string, plugin_ep_options, "", + "Specifies provider options for each EP listed in --plugin_eps. Options (key-value pairs) for each EP are separated by space and EPs are separated by semicolons.\n" + "[Usage]: --plugin_ep_options \"ep_1_option_1_key|ep_1_option_1_value ...;ep_2_option_1_key|ep_2_option_1_value ...;... \" or \n" + "--plugin_ep_options \";ep_2_option_1_key|ep_2_option_1_value ...;... \" or \n" + "--plugin_ep_options \"ep_1_option_1_key|ep_1_option_1_value ...;;ep_3_option_1_key|ep_3_option_1_value ...;... \""); +ABSL_FLAG(bool, list_ep_devices, false, "Prints all available device indices and their properties (including metadata). This option makes the program exit early without performing inference.\n"); +ABSL_FLAG(std::string, select_ep_devices, "", "Specifies a semicolon-separated list of device indices to add to the session and run with."); +ABSL_FLAG(bool, h, false, "Print program usage."); + namespace onnxruntime { namespace perftest { -/*static*/ void CommandLineParser::ShowUsage() { - printf( - "perf_test [options...] model_path [result_file]\n" - "Options:\n" - "\t-m [test_mode]: Specifies the test mode. Value could be 'duration' or 'times'.\n" - "\t\tProvide 'duration' to run the test for a fix duration, and 'times' to repeated for a certain times. \n" - "\t-M: Disable memory pattern.\n" - "\t-A: Disable memory arena\n" - "\t-I: Generate tensor input binding. Free dimensions are treated as 1 unless overridden using -f.\n" - "\t-c [parallel runs]: Specifies the (max) number of runs to invoke simultaneously. Default:1.\n" - "\t-e [cpu|cuda|dnnl|tensorrt|openvino|dml|acl|nnapi|coreml|qnn|snpe|rocm|migraphx|xnnpack|vitisai|webgpu]: Specifies the provider 'cpu','cuda','dnnl','tensorrt', " - "'nvtensorrtrtx', 'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'qnn', 'snpe', 'rocm', 'migraphx', 'xnnpack', 'vitisai' or 'webgpu'. " - "Default:'cpu'.\n" - "\t-b [tf|ort]: backend to use. 
Default:ort\n" - "\t-r [repeated_times]: Specifies the repeated times if running in 'times' test mode.Default:1000.\n" - "\t-t [seconds_to_run]: Specifies the seconds to run for 'duration' mode. Default:600.\n" - "\t-p [profile_file]: Specifies the profile name to enable profiling and dump the profile data to the file.\n" - "\t-s: Show statistics result, like P75, P90. If no result_file provided this defaults to on.\n" - "\t-S: Given random seed, to produce the same input data. This defaults to -1(no initialize).\n" - "\t-v: Show verbose information.\n" - "\t-x [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0.\n" - "\t-y [inter_op_num_threads]: Sets the number of threads used to parallelize the execution of the graph (across nodes), A value of 0 means ORT will pick a default. Must >=0.\n" - "\t-f [free_dimension_override]: Specifies a free dimension by name to override to a specific value for performance optimization. " - "Syntax is [dimension_name:override_value]. override_value must > 0\n" - "\t-F [free_dimension_override]: Specifies a free dimension by denotation to override to a specific value for performance optimization. " - "Syntax is [dimension_denotation:override_value]. override_value must > 0\n" - "\t-P: Use parallel executor instead of sequential executor.\n" - "\t-o [optimization level]: Default is 99 (all). Valid values are 0 (disable), 1 (basic), 2 (extended), 3 (layout), 99 (all).\n" - "\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels.\n" - "\t-u [optimized_model_path]: Specify the optimized model path for saving.\n" - "\t-d [CUDA only][cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default). \n" - "\t-q [CUDA only] use separate stream for copy. \n" - "\t-g [TensorRT RTX | TensorRT | CUDA] Enable tensor input and output bindings on CUDA before session run \n" - "\t-z: Set denormal as zero. When turning on this option reduces latency dramatically, a model may have denormals.\n" - "\t-C: Specify session configuration entries as key-value pairs: -C \"| |\" \n" - "\t Refer to onnxruntime_session_options_config_keys.h for valid keys and values. \n" - "\t [Example] -C \"session.disable_cpu_ep_fallback|1 ep.context_enable|1\" \n" - "\t-i: Specify EP specific runtime options as key value pairs. 
Different runtime options available are: \n" - "\t [Usage]: -e -i '| |'\n" - "\n" - "\t [ACL only] [enable_fast_math]: Options: 'true', 'false', default: 'false', \n" - "\t [DML only] [performance_preference]: DML device performance preference, options: 'default', 'minimum_power', 'high_performance', \n" - "\t [DML only] [device_filter]: DML device filter, options: 'any', 'gpu', 'npu', \n" - "\t [DML only] [disable_metacommands]: Options: 'true', 'false', \n" - "\t [DML only] [enable_graph_capture]: Options: 'true', 'false', \n" - "\t [DML only] [enable_graph_serialization]: Options: 'true', 'false', \n" - "\n" - "\t [OpenVINO only] [device_type]: Overrides the accelerator hardware type and precision with these values at runtime.\n" - "\t [OpenVINO only] [device_id]: Selects a particular hardware device for inference.\n" - "\t [OpenVINO only] [num_of_threads]: Overrides the accelerator hardware type and precision with these values at runtime.\n" - "\t [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n" - "\t [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n" - "\t [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" - "\n" - "\t [QNN only] [backend_type]: QNN backend type. E.g., 'cpu', 'htp'. Mutually exclusive with 'backend_path'.\n" - "\t [QNN only] [backend_path]: QNN backend path. E.g., '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. Mutually exclusive with 'backend_type'.\n" - "\t [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n" - "\t [QNN only] [profiling_file_path] : QNN profiling file path if ETW not enabled.\n" - "\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" - "\t [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n" - "\t [QNN only] [htp_performance_mode]: QNN performance mode, options: 'burst', 'balanced', 'default', 'high_performance', \n" - "\t 'high_power_saver', 'low_balanced', 'extreme_power_saver', 'low_power_saver', 'power_saver', 'sustained_high_performance'. Default to 'default'. \n" - "\t [QNN only] [op_packages]: QNN UDO package, allowed format: \n" - "\t op_packages|::[:],::[:]. \n" - "\t [QNN only] [qnn_context_priority]: QNN context priority, options: 'low', 'normal', 'normal_high', 'high'. Default to 'normal'. \n" - "\t [QNN only] [qnn_saver_path]: QNN Saver backend path. e.g '/folderpath/libQnnSaver.so'.\n" - "\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: \n" - "\t '0', '1', '2', '3', default is '0'.\n" - "\t [QNN only] [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n" - "\t [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. \n" - "\t Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). \n" - "\t [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n" - "\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n" - "\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. 
Defaults to '1' (with FP16 precision.). \n" - "\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n" - "\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n" - "\t [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill fill buffer, used while generating QNN context binary.\n" - "\t [QNN only] [enable_htp_shared_memory_allocator]: Enable the QNN HTP shared memory allocator and use it for inputs and outputs. Requires libcdsprpc.so/dll to be available.\n" - "\t Defaults to '0' (disabled).\n" - "\t [Example] [For QNN EP] -e qnn -i \"backend_type|cpu\" \n" - "\n" - "\t [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n" - "\t [TensorRT only] [trt_min_subgraph_size]: Minimum size of TensorRT subgraphs.\n" - "\t [TensorRT only] [trt_max_workspace_size]: Set TensorRT maximum workspace size in byte.\n" - "\t [TensorRT only] [trt_fp16_enable]: Enable TensorRT FP16 precision.\n" - "\t [TensorRT only] [trt_int8_enable]: Enable TensorRT INT8 precision.\n" - "\t [TensorRT only] [trt_int8_calibration_table_name]: Specify INT8 calibration table name.\n" - "\t [TensorRT only] [trt_int8_use_native_calibration_table]: Use Native TensorRT calibration table.\n" - "\t [TensorRT only] [trt_dla_enable]: Enable DLA in Jetson device.\n" - "\t [TensorRT only] [trt_dla_core]: DLA core number.\n" - "\t [TensorRT only] [trt_dump_subgraphs]: Dump TRT subgraph to onnx model.\n" - "\t [TensorRT only] [trt_engine_cache_enable]: Enable engine caching.\n" - "\t [TensorRT only] [trt_engine_cache_path]: Specify engine cache path.\n" - "\t [TensorRT only] [trt_engine_cache_prefix]: Customize engine cache prefix when trt_engine_cache_enable is true.\n" - "\t [TensorRT only] [trt_engine_hw_compatible]: Enable hardware compatibility. Engines ending with '_sm80+' can be re-used across all Ampere+ GPU (a hardware-compatible engine may have lower throughput and/or higher latency than its non-hardware-compatible counterpart).\n" - "\t [TensorRT only] [trt_weight_stripped_engine_enable]: Enable weight-stripped engine build.\n" - "\t [TensorRT only] [trt_onnx_model_folder_path]: Folder path for the ONNX model with weights.\n" - "\t [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n" - "\t [TensorRT only] [trt_context_memory_sharing_enable]: Enable TensorRT context memory sharing between subgraphs.\n" - "\t [TensorRT only] [trt_layer_norm_fp32_fallback]: Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow.\n" - "\t [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n" - "\n" - "\t [NNAPI only] [NNAPI_FLAG_USE_FP16]: Use fp16 relaxation in NNAPI EP..\n" - "\t [NNAPI only] [NNAPI_FLAG_USE_NCHW]: Use the NCHW layout in NNAPI EP.\n" - "\t [NNAPI only] [NNAPI_FLAG_CPU_DISABLED]: Prevent NNAPI from using CPU devices.\n" - "\t [NNAPI only] [NNAPI_FLAG_CPU_ONLY]: Using CPU only in NNAPI EP.\n" - "\t [Example] [For NNAPI EP] -e nnapi -i \"NNAPI_FLAG_USE_FP16 NNAPI_FLAG_USE_NCHW NNAPI_FLAG_CPU_DISABLED\"\n" - "\n" - "\t [CoreML only] [ModelFormat]:[MLProgram, NeuralNetwork] Create an ML Program model or Neural Network. 
Default is NeuralNetwork.\n" - "\t [CoreML only] [MLComputeUnits]:[CPUAndNeuralEngine CPUAndGPU ALL CPUOnly] Specify to limit the backend device used to run the model.\n" - "\t [CoreML only] [AllowStaticInputShapes]:[0 1].\n" - "\t [CoreML only] [EnableOnSubgraphs]:[0 1].\n" - "\t [CoreML only] [SpecializationStrategy]:[Default FastPrediction].\n" - "\t [CoreML only] [ProfileComputePlan]:[0 1].\n" - "\t [CoreML only] [AllowLowPrecisionAccumulationOnGPU]:[0 1].\n" - "\t [CoreML only] [ModelCacheDirectory]:[path../a/b/c].\n" - "\t [Example] [For CoreML EP] -e coreml -i \"ModelFormat|MLProgram MLComputeUnits|CPUAndGPU\"\n" - "\n" - "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" - "\t [SNPE only] [priority]: execution priority, options: 'low', 'normal'. \n" - "\t [SNPE only] [buffer_type]: options: 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. default: ITENSOR'. \n" - "\t [SNPE only] [enable_init_cache]: enable SNPE init caching feature, set to 1 to enabled it. Disabled by default. \n" - "\t [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n" - "\n" - "\t-T [Set intra op thread affinities]: Specify intra op thread affinity string\n" - "\t [Example]: -T 1,2;3,4;5,6 or -T 1-2;3-4;5-6 \n" - "\t\t Use semicolon to separate configuration between threads.\n" - "\t\t E.g. 1,2;3,4;5,6 specifies affinities for three threads, the first thread will be attached to the first and second logical processor.\n" - "\t\t The number of affinities must be equal to intra_op_num_threads - 1\n\n" - "\t-D [Disable thread spinning]: disable spinning entirely for thread owned by onnxruntime intra-op thread pool.\n" - "\t-Z [Force thread to stop spinning between runs]: disallow thread from spinning during runs to reduce cpu usage.\n" - "\t-n [Exit after session creation]: allow user to measure session creation time to measure impact of enabling any initialization optimizations.\n" - "\t-l Provide file as binary in memory by using fopen before session creation.\n" - "\t-R [Register custom op]: allow user to register custom op by .so or .dll file.\n" - "\t-X [Enable onnxruntime-extensions custom ops]: Registers custom ops from onnxruntime-extensions. " - "onnxruntime-extensions must have been built in to onnxruntime. 
This can be done with the build.py " - "'--use_extensions' option.\n" - "\t-h: help\n"); -} -#ifdef _WIN32 -static const ORTCHAR_T* overrideDelimiter = L":"; -#else -static const ORTCHAR_T* overrideDelimiter = ":"; -#endif -static bool ParseDimensionOverride(std::basic_string& dim_identifier, int64_t& override_val) { - std::basic_string free_dim_str(optarg); - size_t delimiter_location = free_dim_str.find(overrideDelimiter); +static bool ParseDimensionOverride(std::string& dim_identifier, int64_t& override_val, const char* option) { + std::basic_string free_dim_str(option); + size_t delimiter_location = free_dim_str.find(":"); if (delimiter_location >= free_dim_str.size() - 1) { return false; } dim_identifier = free_dim_str.substr(0, delimiter_location); - std::basic_string override_val_str = free_dim_str.substr(delimiter_location + 1, std::wstring::npos); + std::string override_val_str = free_dim_str.substr(delimiter_location + 1, std::string::npos); ORT_TRY { override_val = std::stoll(override_val_str.c_str()); if (override_val <= 0) { @@ -193,240 +188,326 @@ static bool ParseDimensionOverride(std::basic_string& dim_identifier, return true; } -/*static*/ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int argc, ORTCHAR_T* argv[]) { - int ch; - while ((ch = getopt(argc, argv, ORT_TSTR("m:e:r:t:p:x:y:c:d:o:u:i:f:F:S:T:C:AMPIDZvhsqznlgR:X"))) != -1) { - switch (ch) { - case 'f': { - std::basic_string dim_name; - int64_t override_val; - if (!ParseDimensionOverride(dim_name, override_val)) { - return false; - } - test_config.run_config.free_dim_name_overrides[dim_name] = override_val; - break; +std::string CustomUsageMessage() { + std::ostringstream oss; + oss << "onnxruntime_perf_test [options...] model_path [result_file]\n\n"; + oss << "Note: Options may be specified with either a single dash(-option) or a double dash(--option). Both forms are accepted and treated identically.\n\n"; + oss << "Options:"; + + return oss.str(); +} + +bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int argc, ORTCHAR_T* argv[]) { + // Following callback is to make sure all the ABSL flags defined above will be showed up when running with "--help". + // Note: By default abseil only wants flags in binary's main. It expects the main routine to reside in .cc or -main.cc or + // _main.cc, where the is the name of the binary (without .exe on Windows). See usage_config.cc in abseil for more details. 
+ absl::FlagsUsageConfig config; + config.contains_help_flags = [](absl::string_view filename) { + return std::filesystem::path(filename).filename() == std::filesystem::path(__FILE__).filename(); + }; + + config.normalize_filename = [](absl::string_view f) { + return std::string(f); + }; + absl::SetFlagsUsageConfig(config); + absl::SetProgramUsageMessage(CustomUsageMessage()); + + auto utf8_strings = utils::ConvertArgvToUtf8Strings(argc, argv); + auto utf8_argv = utils::CStringsFromStrings(utf8_strings); + auto positional = absl::ParseCommandLine(static_cast(utf8_argv.size()), utf8_argv.data()); + + // -f + { + const auto& dim_override_str = absl::GetFlag(FLAGS_f); + if (!dim_override_str.empty()) { + std::string dim_name; + int64_t override_val; + if (!ParseDimensionOverride(dim_name, override_val, dim_override_str.c_str())) { + return false; } - case 'F': { - std::basic_string dim_denotation; - int64_t override_val; - if (!ParseDimensionOverride(dim_denotation, override_val)) { - return false; - } - test_config.run_config.free_dim_denotation_overrides[dim_denotation] = override_val; - break; + test_config.run_config.free_dim_name_overrides[dim_name] = override_val; + } + } + + // -F + { + const auto& dim_override_str = absl::GetFlag(FLAGS_F); + if (!dim_override_str.empty()) { + std::string dim_denotation; + int64_t override_val; + if (!ParseDimensionOverride(dim_denotation, override_val, dim_override_str.c_str())) { + return false; } - case 'm': - if (!CompareCString(optarg, ORT_TSTR("duration"))) { - test_config.run_config.test_mode = TestMode::kFixDurationMode; - } else if (!CompareCString(optarg, ORT_TSTR("times"))) { - test_config.run_config.test_mode = TestMode::KFixRepeatedTimesMode; - } else { - return false; - } - break; - case 'p': - test_config.run_config.profile_file = optarg; - break; - case 'M': - test_config.run_config.enable_memory_pattern = false; - break; - case 'A': - test_config.run_config.enable_cpu_mem_arena = false; - break; - case 'e': - if (!CompareCString(optarg, ORT_TSTR("cpu"))) { - test_config.machine_config.provider_type_name = onnxruntime::kCpuExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("cuda"))) { - test_config.machine_config.provider_type_name = onnxruntime::kCudaExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("dnnl"))) { - test_config.machine_config.provider_type_name = onnxruntime::kDnnlExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("openvino"))) { - test_config.machine_config.provider_type_name = onnxruntime::kOpenVINOExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("tensorrt"))) { - test_config.machine_config.provider_type_name = onnxruntime::kTensorrtExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("qnn"))) { - test_config.machine_config.provider_type_name = onnxruntime::kQnnExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("snpe"))) { - test_config.machine_config.provider_type_name = onnxruntime::kSnpeExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("nnapi"))) { - test_config.machine_config.provider_type_name = onnxruntime::kNnapiExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("vsinpu"))) { - test_config.machine_config.provider_type_name = onnxruntime::kVSINPUExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("coreml"))) { - test_config.machine_config.provider_type_name = onnxruntime::kCoreMLExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("dml"))) { - 
test_config.machine_config.provider_type_name = onnxruntime::kDmlExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("acl"))) { - test_config.machine_config.provider_type_name = onnxruntime::kAclExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("armnn"))) { - test_config.machine_config.provider_type_name = onnxruntime::kArmNNExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("rocm"))) { - test_config.machine_config.provider_type_name = onnxruntime::kRocmExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("migraphx"))) { - test_config.machine_config.provider_type_name = onnxruntime::kMIGraphXExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("xnnpack"))) { - test_config.machine_config.provider_type_name = onnxruntime::kXnnpackExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("vitisai"))) { - test_config.machine_config.provider_type_name = onnxruntime::kVitisAIExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("webgpu"))) { - test_config.machine_config.provider_type_name = onnxruntime::kWebGpuExecutionProvider; - } else if (!CompareCString(optarg, ORT_TSTR("nvtensorrtrtx"))) { - test_config.machine_config.provider_type_name = onnxruntime::kNvTensorRTRTXExecutionProvider; - } else { - return false; - } - break; - case 'r': - test_config.run_config.repeated_times = static_cast(OrtStrtol(optarg, nullptr)); - if (test_config.run_config.repeated_times <= 0) { - return false; - } - test_config.run_config.test_mode = TestMode::KFixRepeatedTimesMode; - break; - case 't': - test_config.run_config.duration_in_seconds = static_cast(OrtStrtol(optarg, nullptr)); - if (test_config.run_config.repeated_times <= 0) { - return false; - } + test_config.run_config.free_dim_denotation_overrides[dim_denotation] = override_val; + } + } + + // -m + { + const auto& test_mode_str = absl::GetFlag(FLAGS_m); + if (!test_mode_str.empty()) { + if (test_mode_str == "duration") { test_config.run_config.test_mode = TestMode::kFixDurationMode; - break; - case 's': - test_config.run_config.f_dump_statistics = true; - break; - case 'S': - test_config.run_config.random_seed_for_input_data = static_cast( - OrtStrtol(optarg, nullptr)); - break; - case 'v': - test_config.run_config.f_verbose = true; - break; - case 'x': - test_config.run_config.intra_op_num_threads = static_cast(OrtStrtol(optarg, nullptr)); - if (test_config.run_config.intra_op_num_threads < 0) { - return false; - } - break; - case 'y': - test_config.run_config.inter_op_num_threads = static_cast(OrtStrtol(optarg, nullptr)); - if (test_config.run_config.inter_op_num_threads < 0) { - return false; - } - break; - case 'P': - test_config.run_config.execution_mode = ExecutionMode::ORT_PARALLEL; - break; - case 'c': - test_config.run_config.concurrent_session_runs = - static_cast(OrtStrtol(optarg, nullptr)); - if (test_config.run_config.concurrent_session_runs <= 0) { - return false; - } - break; - case 'o': { - int tmp = static_cast(OrtStrtol(optarg, nullptr)); - switch (tmp) { - case ORT_DISABLE_ALL: - test_config.run_config.optimization_level = ORT_DISABLE_ALL; - break; - case ORT_ENABLE_BASIC: - test_config.run_config.optimization_level = ORT_ENABLE_BASIC; - break; - case ORT_ENABLE_EXTENDED: - test_config.run_config.optimization_level = ORT_ENABLE_EXTENDED; - break; - case ORT_ENABLE_LAYOUT: - test_config.run_config.optimization_level = ORT_ENABLE_LAYOUT; - break; - case ORT_ENABLE_ALL: + } else if (test_mode_str == "times") { + test_config.run_config.test_mode = 
TestMode::KFixRepeatedTimesMode; + } else { + return false; + } + } + } + + // -p + { + const auto& profile_file = absl::GetFlag(FLAGS_p); + if (!profile_file.empty()) test_config.run_config.profile_file = ToPathString(profile_file); + } + + // -M + test_config.run_config.enable_memory_pattern = absl::GetFlag(FLAGS_M); + + // -A + test_config.run_config.enable_cpu_mem_arena = absl::GetFlag(FLAGS_A); + + // -e + { + auto const& ep = absl::GetFlag(FLAGS_e); + if (!ep.empty()) { + if (ep == "cpu") { + test_config.machine_config.provider_type_name = onnxruntime::kCpuExecutionProvider; + } else if (ep == "cuda") { + test_config.machine_config.provider_type_name = onnxruntime::kCudaExecutionProvider; + } else if (ep == "dnnl") { + test_config.machine_config.provider_type_name = onnxruntime::kDnnlExecutionProvider; + } else if (ep == "openvino") { + test_config.machine_config.provider_type_name = onnxruntime::kOpenVINOExecutionProvider; + } else if (ep == "tensorrt") { + test_config.machine_config.provider_type_name = onnxruntime::kTensorrtExecutionProvider; + } else if (ep == "qnn") { + test_config.machine_config.provider_type_name = onnxruntime::kQnnExecutionProvider; + } else if (ep == "snpe") { + test_config.machine_config.provider_type_name = onnxruntime::kSnpeExecutionProvider; + } else if (ep == "nnapi") { + test_config.machine_config.provider_type_name = onnxruntime::kNnapiExecutionProvider; + } else if (ep == "vsinpu") { + test_config.machine_config.provider_type_name = onnxruntime::kVSINPUExecutionProvider; + } else if (ep == "coreml") { + test_config.machine_config.provider_type_name = onnxruntime::kCoreMLExecutionProvider; + } else if (ep == "dml") { + test_config.machine_config.provider_type_name = onnxruntime::kDmlExecutionProvider; + } else if (ep == "acl") { + test_config.machine_config.provider_type_name = onnxruntime::kAclExecutionProvider; + } else if (ep == "armnn") { + test_config.machine_config.provider_type_name = onnxruntime::kArmNNExecutionProvider; + } else if (ep == "rocm") { + test_config.machine_config.provider_type_name = onnxruntime::kRocmExecutionProvider; + } else if (ep == "migraphx") { + test_config.machine_config.provider_type_name = onnxruntime::kMIGraphXExecutionProvider; + } else if (ep == "xnnpack") { + test_config.machine_config.provider_type_name = onnxruntime::kXnnpackExecutionProvider; + } else if (ep == "vitisai") { + test_config.machine_config.provider_type_name = onnxruntime::kVitisAIExecutionProvider; + } else if (ep == "webgpu") { + test_config.machine_config.provider_type_name = onnxruntime::kWebGpuExecutionProvider; + } else if (ep == "nvtensorrtrtx") { + test_config.machine_config.provider_type_name = onnxruntime::kNvTensorRTRTXExecutionProvider; + } else { + return false; + } + } + } + + // Helper function to check if the option is explicitly specified. + // Abseil Flags does not provide this capability by default. + // It cannot distinguish between cases where: + // - The user typed `-r 1000` (explicitly passing the default value), and + // - The user omitted `-r` entirely. + // To determine this accurately, we must inspect argv directly. 
+ auto is_option_specified = [&](std::string option) { + for (int i = 1; i < argc; ++i) { + auto utf8_arg = ToUTF8String(argv[i]); + if (utf8_arg == ("-" + option) || utf8_arg == ("--" + option)) { + return true; + } + } + return false; + }; + + // -r + if (is_option_specified("r")) { + if (absl::GetFlag(FLAGS_r) == static_cast(0)) return false; + test_config.run_config.repeated_times = absl::GetFlag(FLAGS_r); + test_config.run_config.test_mode = TestMode::KFixRepeatedTimesMode; + } + + // -t + if (is_option_specified("t")) { + if (absl::GetFlag(FLAGS_t) <= static_cast(0)) return false; + test_config.run_config.duration_in_seconds = absl::GetFlag(FLAGS_t); + test_config.run_config.test_mode = TestMode::kFixDurationMode; + } + + // -s + test_config.run_config.f_dump_statistics = absl::GetFlag(FLAGS_s); + + // -S + test_config.run_config.random_seed_for_input_data = absl::GetFlag(FLAGS_S); + + // -v + test_config.run_config.f_verbose = absl::GetFlag(FLAGS_v); + + // -x + if (absl::GetFlag(FLAGS_x) < 0) return false; + test_config.run_config.intra_op_num_threads = absl::GetFlag(FLAGS_x); + + // -y + if (absl::GetFlag(FLAGS_y) < 0) return false; + test_config.run_config.inter_op_num_threads = absl::GetFlag(FLAGS_y); + + // -P + if (absl::GetFlag(FLAGS_P)) test_config.run_config.execution_mode = ExecutionMode::ORT_PARALLEL; + + // -c + if (absl::GetFlag(FLAGS_c) <= static_cast(0)) return false; + test_config.run_config.concurrent_session_runs = absl::GetFlag(FLAGS_c); + + // -o + { + const auto optimization_level = absl::GetFlag(FLAGS_o); + if (optimization_level != test_config.run_config.optimization_level) { + switch (optimization_level) { + case ORT_DISABLE_ALL: + test_config.run_config.optimization_level = ORT_DISABLE_ALL; + break; + case ORT_ENABLE_BASIC: + test_config.run_config.optimization_level = ORT_ENABLE_BASIC; + break; + case ORT_ENABLE_EXTENDED: + test_config.run_config.optimization_level = ORT_ENABLE_EXTENDED; + break; + case ORT_ENABLE_LAYOUT: + test_config.run_config.optimization_level = ORT_ENABLE_LAYOUT; + break; + case ORT_ENABLE_ALL: + test_config.run_config.optimization_level = ORT_ENABLE_ALL; + break; + default: { + if (optimization_level > ORT_ENABLE_ALL) { // relax constraint test_config.run_config.optimization_level = ORT_ENABLE_ALL; - break; - default: { - if (tmp > ORT_ENABLE_ALL) { // relax constraint - test_config.run_config.optimization_level = ORT_ENABLE_ALL; - } else { - return false; - } + } else { + return false; } } - break; } - case 'u': - test_config.run_config.optimized_model_path = optarg; - break; - case 'I': - test_config.run_config.generate_model_input_binding = true; - break; - case 'd': - test_config.run_config.cudnn_conv_algo = static_cast(OrtStrtol(optarg, nullptr)); - break; - case 'q': - test_config.run_config.do_cuda_copy_in_separate_stream = true; - break; - case 'z': - test_config.run_config.set_denormal_as_zero = true; - break; - case 'i': - test_config.run_config.ep_runtime_config_string = optarg; - break; - case 'T': - test_config.run_config.intra_op_thread_affinities = ToUTF8String(optarg); - break; - case 'C': { - ORT_TRY { - ParseSessionConfigs(ToUTF8String(optarg), test_config.run_config.session_config_entries); - } - ORT_CATCH(const std::exception& ex) { - ORT_HANDLE_EXCEPTION([&]() { - fprintf(stderr, "Error parsing session configuration entries: %s\n", ex.what()); - }); - return false; - } - break; + } + } + + // -u + { + const auto& optimized_model_path = absl::GetFlag(FLAGS_u); + if (!optimized_model_path.empty()) 
test_config.run_config.optimized_model_path = ToPathString(optimized_model_path); + } + + // -I + test_config.run_config.generate_model_input_binding = absl::GetFlag(FLAGS_I); + + // -d + if (absl::GetFlag(FLAGS_d) < 0) return false; + test_config.run_config.cudnn_conv_algo = absl::GetFlag(FLAGS_d); + + // -q + test_config.run_config.do_cuda_copy_in_separate_stream = absl::GetFlag(FLAGS_q); + + // -z + test_config.run_config.set_denormal_as_zero = absl::GetFlag(FLAGS_z); + + // -i + { + const auto& ep_options = absl::GetFlag(FLAGS_i); + if (!ep_options.empty()) test_config.run_config.ep_runtime_config_string = ToPathString(ep_options); + } + + // -T + if (!absl::GetFlag(FLAGS_T).empty()) test_config.run_config.intra_op_thread_affinities = absl::GetFlag(FLAGS_T); + + // -C + { + const auto& session_configs = absl::GetFlag(FLAGS_C); + if (!session_configs.empty()) { + ORT_TRY { + ParseSessionConfigs(session_configs, test_config.run_config.session_config_entries); } - case 'D': - test_config.run_config.disable_spinning = true; - break; - case 'Z': - test_config.run_config.disable_spinning_between_run = true; - break; - case 'n': - test_config.run_config.exit_after_session_creation = true; - break; - case 'l': - test_config.model_info.load_via_path = true; - break; - case 'R': - test_config.run_config.register_custom_op_path = optarg; - break; - case 'g': - test_config.run_config.enable_cuda_io_binding = true; - break; - case 'X': - test_config.run_config.use_extensions = true; - break; - case '?': - case 'h': - default: + ORT_CATCH(const std::exception& ex) { + ORT_HANDLE_EXCEPTION([&]() { + fprintf(stderr, "Error parsing session configuration entries: %s\n", ex.what()); + }); return false; + } } } - // parse model_path and result_file_path - argc -= optind; - argv += optind; - - switch (argc) { - case 2: - test_config.model_info.result_file_path = argv[1]; - break; - case 1: - test_config.run_config.f_dump_statistics = true; - break; - default: - return false; + // -D + test_config.run_config.disable_spinning = absl::GetFlag(FLAGS_D); + + // -Z + test_config.run_config.disable_spinning_between_run = absl::GetFlag(FLAGS_Z); + + // -n + test_config.run_config.exit_after_session_creation = absl::GetFlag(FLAGS_n); + + // -l + test_config.model_info.load_via_path = absl::GetFlag(FLAGS_l); + + // -R + { + const auto& register_custom_op_path = absl::GetFlag(FLAGS_R); + if (!register_custom_op_path.empty()) test_config.run_config.register_custom_op_path = ToPathString(register_custom_op_path); } - test_config.model_info.model_file_path = argv[0]; + // -g + test_config.run_config.enable_cuda_io_binding = absl::GetFlag(FLAGS_g); + + // -X + test_config.run_config.use_extensions = absl::GetFlag(FLAGS_X); + + // --plugin_ep_libs + { + const auto& plugin_ep_names_and_libs = absl::GetFlag(FLAGS_plugin_ep_libs); + if (!plugin_ep_names_and_libs.empty()) test_config.plugin_ep_names_and_libs = ToPathString(plugin_ep_names_and_libs); + } + + // --plugin_eps + { + const auto& plugin_eps = absl::GetFlag(FLAGS_plugin_eps); + if (!plugin_eps.empty()) ParseEpList(plugin_eps, test_config.machine_config.plugin_provider_type_list); + } + + // --plugin_ep_options + { + const auto& plugin_ep_options = absl::GetFlag(FLAGS_plugin_ep_options); + if (!plugin_ep_options.empty()) test_config.run_config.ep_runtime_config_string = ToPathString(plugin_ep_options); + } + + // --list_ep_devices + if (absl::GetFlag(FLAGS_list_ep_devices)) { + test_config.list_available_ep_devices = true; + return true; + } + + // 
--select_ep_devices + { + const auto& select_ep_devices = absl::GetFlag(FLAGS_select_ep_devices); + if (!select_ep_devices.empty()) test_config.selected_ep_device_indices = select_ep_devices; + } + + if (positional.size() == 2) { + test_config.model_info.model_file_path = ToPathString(positional[1]); + test_config.run_config.f_dump_statistics = true; + } else if (positional.size() == 3) { + test_config.model_info.model_file_path = ToPathString(positional[1]); + test_config.model_info.result_file_path = ToPathString(positional[2]); + } else { + return false; + } return true; } diff --git a/onnxruntime/test/perftest/command_args_parser.h b/onnxruntime/test/perftest/command_args_parser.h index 86c81072233c0..5a94f99874797 100644 --- a/onnxruntime/test/perftest/command_args_parser.h +++ b/onnxruntime/test/perftest/command_args_parser.h @@ -11,7 +11,6 @@ struct PerformanceTestConfig; class CommandLineParser { public: - static void ShowUsage(); static bool ParseArguments(PerformanceTestConfig& test_config, int argc, ORTCHAR_T* argv[]); }; diff --git a/onnxruntime/test/perftest/common_utils.cc b/onnxruntime/test/perftest/common_utils.cc new file mode 100644 index 0000000000000..5cc6c240e25f0 --- /dev/null +++ b/onnxruntime/test/perftest/common_utils.cc @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "test/perftest/utils.h" +#include "test/perftest/strings_helper.h" +#include + +#include + +#include + +namespace onnxruntime { +namespace perftest { +namespace utils { + +void ListEpDevices(const Ort::Env& env) { + std::vector ep_devices = env.GetEpDevices(); + + for (size_t i = 0; i < ep_devices.size(); ++i) { + auto device = ep_devices[i]; + std::string device_info_msg = "===== device id " + std::to_string(i) + " ======\n"; + device_info_msg += "name: " + std::string(device.EpName()) + "\n"; + device_info_msg += "vendor: " + std::string(device.EpVendor()) + "\n"; + + auto metadata = device.EpMetadata(); + std::unordered_map metadata_entries = metadata.GetKeyValuePairs(); + if (!metadata_entries.empty()) { + device_info_msg += "metadata:\n"; + } + + for (auto& entry : metadata_entries) { + device_info_msg += " " + entry.first + ": " + entry.second + "\n"; + } + device_info_msg += "\n"; + fprintf(stdout, "%s", device_info_msg.c_str()); + } +} + +void RegisterExecutionProviderLibrary(Ort::Env& env, PerformanceTestConfig& test_config) { + if (!test_config.plugin_ep_names_and_libs.empty()) { + std::unordered_map ep_names_to_libs; + ParseSessionConfigs(ToUTF8String(test_config.plugin_ep_names_and_libs), ep_names_to_libs); + if (ep_names_to_libs.size() > 0) { + for (auto& pair : ep_names_to_libs) { + const std::filesystem::path library_path = pair.second; + const std::string registration_name = pair.first; + Ort::Status status(Ort::GetApi().RegisterExecutionProviderLibrary(env, registration_name.c_str(), ToPathString(library_path.string()).c_str())); + if (status.IsOK()) { + test_config.registered_plugin_eps.push_back(registration_name); + } else { + fprintf(stderr, "Can't register %s plugin library: %s\n", registration_name.c_str(), status.GetErrorMessage().c_str()); + } + } + } + } +} + +void UnregisterExecutionProviderLibrary(Ort::Env& env, PerformanceTestConfig& test_config) { + for (auto& registration_name : test_config.registered_plugin_eps) { + Ort::Status status(Ort::GetApi().UnregisterExecutionProviderLibrary(env, registration_name.c_str())); + if (!status.IsOK()) { + fprintf(stderr, "%s", 
status.GetErrorMessage().c_str());
+    }
+  }
+}
+
+std::vector<std::string> ConvertArgvToUtf8Strings(int argc, ORTCHAR_T* argv[]) {
+  std::vector<std::string> utf8_args;
+  utf8_args.reserve(argc);
+  for (int i = 0; i < argc; ++i) {
+    std::string utf8_string = ToUTF8String(argv[i]);
+
+    // Abseil flags doesn't natively alias "-h" to "--help".
+    // We make "-h" an alias of "--help" here.
+    if (utf8_string == "-h" || utf8_string == "--h") {
+      utf8_args.push_back("--help");
+    } else {
+      utf8_args.push_back(utf8_string);
+    }
+  }
+  return utf8_args;
+}
+
+std::vector<char*> CStringsFromStrings(std::vector<std::string>& utf8_args) {
+  std::vector<char*> utf8_argv;
+  utf8_argv.reserve(utf8_args.size());
+  for (auto& str : utf8_args) {
+    utf8_argv.push_back(&str[0]);
+  }
+  return utf8_argv;
+}
+
+} // namespace utils
+} // namespace perftest
+} // namespace onnxruntime
diff --git a/onnxruntime/test/perftest/main.cc b/onnxruntime/test/perftest/main.cc
index 43bf54963cabb..973baf774b024 100644
--- a/onnxruntime/test/perftest/main.cc
+++ b/onnxruntime/test/perftest/main.cc
@@ -6,6 +6,8 @@
 #include
 #include "command_args_parser.h"
 #include "performance_runner.h"
+#include "utils.h"
+#include "strings_helper.h"
 #include

 using namespace onnxruntime;
@@ -19,7 +21,7 @@ int real_main(int argc, char* argv[]) {
   g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
   perftest::PerformanceTestConfig test_config;
   if (!perftest::CommandLineParser::ParseArguments(test_config, argc, argv)) {
-    perftest::CommandLineParser::ShowUsage();
+    fprintf(stderr, "%s", "See 'onnxruntime_perf_test --help'.");
     return -1;
   }
   Ort::Env env{nullptr};
@@ -41,6 +43,30 @@ int real_main(int argc, char* argv[]) {
     if (failed) return -1;
   }
+
+  if (!test_config.plugin_ep_names_and_libs.empty()) {
+    perftest::utils::RegisterExecutionProviderLibrary(env, test_config);
+  }
+
+  // Unregister all registered plugin EP libraries before the program exits.
+  // This is necessary because unregistering the plugin EP also unregisters any associated shared allocators.
+  // If we don't do this and the program returns, the factories stored inside the environment will be destroyed when the environment goes out of scope.
+  // Later, when the shared allocator's deleter runs, it may cause a segmentation fault because it attempts to use the already-destroyed factory to call ReleaseAllocator.
+  // See "ep_device.ep_factory->ReleaseAllocator" in Environment::CreateSharedAllocatorImpl.
+  auto unregister_plugin_eps_at_scope_exit = gsl::finally([&]() {
+    if (!test_config.registered_plugin_eps.empty()) {
+      perftest::utils::UnregisterExecutionProviderLibrary(env, test_config);  // this won't throw
+    }
+  });
+
+  if (test_config.list_available_ep_devices) {
+    perftest::utils::ListEpDevices(env);
+    if (test_config.registered_plugin_eps.empty()) {
+      fprintf(stdout, "No plugin execution provider libraries are registered. 
Please specify them using \"--plugin_ep_libs\"; otherwise, only CPU may be available.\n"); + } + return 0; + } + std::random_device rd; perftest::PerformanceRunner perf_runner(env, test_config, rd); diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 7a210ca8482a4..7156a1eb5c347 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -62,6 +62,84 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device : rand_engine_(rd()), input_names_(m.GetInputCount()), input_names_str_(m.GetInputCount()), input_length_(m.GetInputCount()) { Ort::SessionOptions session_options; + // Add EP devices if any (created by plugin EP) + if (!performance_test_config.registered_plugin_eps.empty()) { + std::vector ep_devices = env.GetEpDevices(); + // EP -> associated EP devices (All OrtEpDevice instances must be from the same execution provider) + std::unordered_map> added_ep_devices; + std::unordered_set added_ep_device_index_set; + + auto& ep_list = performance_test_config.machine_config.plugin_provider_type_list; + std::unordered_set ep_set(ep_list.begin(), ep_list.end()); + + // Select EP devices by provided device index + if (!performance_test_config.selected_ep_device_indices.empty()) { + std::vector device_list; + device_list.reserve(performance_test_config.selected_ep_device_indices.size()); + ParseEpDeviceIndexList(performance_test_config.selected_ep_device_indices, device_list); + for (auto index : device_list) { + if (static_cast(index) > (ep_devices.size() - 1)) { + fprintf(stderr, "%s", "The device index provided is not correct. Will skip this device id."); + continue; + } + + Ort::ConstEpDevice& device = ep_devices[index]; + if (ep_set.find(std::string(device.EpName())) != ep_set.end()) { + if (added_ep_device_index_set.find(index) == added_ep_device_index_set.end()) { + added_ep_devices[device.EpName()].push_back(device); + added_ep_device_index_set.insert(index); + fprintf(stdout, "[Plugin EP] EP Device [Index: %d, Name: %s] has been added to session.\n", index, device.EpName()); + } + } else { + std::string err_msg = "[Plugin EP] [WARNING] : The EP device index and its corresponding OrtEpDevice is not created from " + + performance_test_config.machine_config.provider_type_name + ". Will skip adding this device.\n"; + fprintf(stderr, "%s", err_msg.c_str()); + } + } + } else { + // Find and select the OrtEpDevice associated with the EP in "--plugin_eps". + for (size_t index = 0; index < ep_devices.size(); ++index) { + Ort::ConstEpDevice& device = ep_devices[index]; + if (ep_set.find(std::string(device.EpName())) != ep_set.end()) { + added_ep_devices[device.EpName()].push_back(device); + fprintf(stdout, "EP Device [Index: %d, Name: %s] has been added to session.\n", static_cast(index), device.EpName()); + } + } + } + + if (added_ep_devices.empty()) { + ORT_THROW("[ERROR] [Plugin EP]: No matching EP devices found."); + } + + std::string ep_option_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); + + // EP's associated provider option lists + std::vector> ep_options_list; + ParseEpOptions(ep_option_string, ep_options_list); + + // If user only provide the EPs' provider option lists for the first several EPs, + // add empty provider option lists for the rest EPs. 
+ if (ep_options_list.size() < ep_list.size()) { + for (size_t i = ep_options_list.size(); i < ep_list.size(); ++i) { + ep_options_list.emplace_back(); // Adds a new empty map + } + } else if (ep_options_list.size() > ep_list.size()) { + ORT_THROW("[ERROR] [Plugin EP]: Too many EP provider option lists provided."); + } + + // EP -> associated provider options + std::unordered_map> ep_options_map; + for (size_t i = 0; i < ep_list.size(); ++i) { + ep_options_map.emplace(ep_list[i], ep_options_list[i]); + } + + for (auto& ep_and_devices : added_ep_devices) { + auto& ep = ep_and_devices.first; + auto& devices = ep_and_devices.second; + session_options.AppendExecutionProvider_V2(env, devices, ep_options_map[ep]); + } + } + provider_name_ = performance_test_config.machine_config.provider_type_name; std::unordered_map provider_options; if (provider_name_ == onnxruntime::kDnnlExecutionProvider) { diff --git a/onnxruntime/test/perftest/strings_helper.cc b/onnxruntime/test/perftest/strings_helper.cc index 9fd49da1d0486..f4860b35c79da 100644 --- a/onnxruntime/test/perftest/strings_helper.cc +++ b/onnxruntime/test/perftest/strings_helper.cc @@ -8,6 +8,8 @@ #include "strings_helper.h" #include "core/common/common.h" +#include "core/common/parse_string.h" +#include "core/common/string_utils.h" namespace onnxruntime { namespace perftest { @@ -53,5 +55,40 @@ void ParseSessionConfigs(const std::string& configs_string, session_configs.insert(std::make_pair(std::move(key), std::move(value))); } } + +void ParseEpOptions(const std::string& input, std::vector>& result) { + auto tokens = utils::SplitString(input, ";", true); + + for (const auto& token : tokens) { + result.emplace_back(); // Adds a new empty map + if (!token.empty()) { + ParseSessionConfigs(std::string(token), result.back()); // only parse non-empty + } + // if token is empty, we still get an empty map in `result` + } +} + +void ParseEpList(const std::string& input, std::vector& result) { + std::stringstream ss(input); + std::string token; + + while (std::getline(ss, token, ';')) { + if (!token.empty()) { + result.push_back(token); + } + } +} + +void ParseEpDeviceIndexList(const std::string& input, std::vector& result) { + std::stringstream ss(input); + std::string item; + + while (std::getline(ss, item, ';')) { + if (!item.empty()) { + int value = ParseStringWithClassicLocale(item); + result.push_back(value); + } + } +} } // namespace perftest } // namespace onnxruntime diff --git a/onnxruntime/test/perftest/strings_helper.h b/onnxruntime/test/perftest/strings_helper.h index 0d6c56709fde6..621ab746273bd 100644 --- a/onnxruntime/test/perftest/strings_helper.h +++ b/onnxruntime/test/perftest/strings_helper.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace onnxruntime { namespace perftest { @@ -12,5 +13,11 @@ namespace perftest { void ParseSessionConfigs(const std::string& configs_string, std::unordered_map& session_configs, const std::unordered_set& available_keys = {}); + +void ParseEpList(const std::string& input, std::vector& result); + +void ParseEpOptions(const std::string& input, std::vector>& result); + +void ParseEpDeviceIndexList(const std::string& input, std::vector& result); } // namespace perftest } // namespace onnxruntime diff --git a/onnxruntime/test/perftest/test_configuration.h b/onnxruntime/test/perftest/test_configuration.h index 8145f5f35c3b3..29ee84dd40dac 100644 --- a/onnxruntime/test/perftest/test_configuration.h +++ b/onnxruntime/test/perftest/test_configuration.h @@ -35,6 +35,7 @@ struct ModelInfo { struct 
MachineConfig { Platform platform{Platform::kWindows}; std::string provider_type_name{onnxruntime::kCpuExecutionProvider}; + std::vector plugin_provider_type_list; }; struct RunConfig { @@ -59,8 +60,8 @@ struct RunConfig { bool set_denormal_as_zero{false}; std::basic_string ep_runtime_config_string; std::unordered_map session_config_entries; - std::map, int64_t> free_dim_name_overrides; - std::map, int64_t> free_dim_denotation_overrides; + std::map free_dim_name_overrides; + std::map free_dim_denotation_overrides; std::string intra_op_thread_affinities; bool disable_spinning = false; bool disable_spinning_between_run = false; @@ -74,6 +75,10 @@ struct PerformanceTestConfig { ModelInfo model_info; MachineConfig machine_config; RunConfig run_config; + std::basic_string plugin_ep_names_and_libs; + std::vector registered_plugin_eps; + std::string selected_ep_device_indices; + bool list_available_ep_devices = false; }; } // namespace perftest diff --git a/onnxruntime/test/perftest/utils.h b/onnxruntime/test/perftest/utils.h index f22abc04fa99e..9f180e2c8d942 100644 --- a/onnxruntime/test/perftest/utils.h +++ b/onnxruntime/test/perftest/utils.h @@ -2,7 +2,8 @@ // Licensed under the MIT License. #pragma once - +#include "test/perftest/test_configuration.h" +#include #include namespace onnxruntime { @@ -22,6 +23,16 @@ class ICPUUsage { std::unique_ptr CreateICPUUsage(); +std::vector ConvertArgvToUtf8Strings(int argc, ORTCHAR_T* argv[]); + +std::vector CStringsFromStrings(std::vector& utf8_args); + +void RegisterExecutionProviderLibrary(Ort::Env& env, PerformanceTestConfig& test_config); + +void UnregisterExecutionProviderLibrary(Ort::Env& env, PerformanceTestConfig& test_config); + +void ListEpDevices(const Ort::Env& env); + } // namespace utils } // namespace perftest } // namespace onnxruntime From 9c9e3a6f3f6af9ba13b687bb61bbba2790b36f81 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 7 Aug 2025 12:48:13 +1000 Subject: [PATCH 10/10] Allow DML EP to be used with any CPU EP (#25664) ### Description Relax restriction on DML EP so other CPU based EPs can be used. ### Motivation and Context #25504 --- .../onnxruntime/core/framework/execution_provider.h | 7 ++++++- onnxruntime/core/session/inference_session.cc | 12 +++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 7df3368ad4e0b..1bb7f219c9a45 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -179,7 +179,12 @@ class IExecutionProvider { /** Get the device id of current execution provider */ - virtual int GetDeviceId() const { return default_device_.Id(); }; + virtual int GetDeviceId() const { return default_device_.Id(); } + + /** + * Get the OrtDevice the execution provider was registered with. + */ + const OrtDevice& GetDevice() const { return default_device_; } /** Get execution provider's configuration options. diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index e0542768aef2f..0dbf54a3ec99e 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1984,13 +1984,15 @@ static void ResolveMemoryPatternFlags(SessionState& session_state) { // For now, this function only checks for invalid combination of DML EP with other EPs. // TODO: extend this function to check for other invalid combinations of EPs. 
 common::Status InferenceSession::HasInvalidCombinationOfExecutionProviders() const {
-  // DML EP is only allowed with CPU EP
+  // DML EP is not allowed with other GPU or NPU EPs.
+  // The historical reason for this is unknown. The limit that DML must only be used with the CPU EP is relaxed to
+  // support scenarios where alternative EPs are CPU based (e.g. OpenVINO).
   bool has_dml_ep = execution_providers_.Get(kDmlExecutionProvider) != nullptr;
   if (has_dml_ep) {
-    const auto& ep_list = execution_providers_.GetIds();
-    for (const auto& ep : ep_list) {
-      if (ep == kDmlExecutionProvider || ep == kCpuExecutionProvider) continue;
-      return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "DML EP can be used with only CPU EP.");
+    for (const auto& ep : execution_providers_) {
+      if (ep->Type() != kDmlExecutionProvider && ep->GetDevice().Type() != OrtDevice::CPU) {
+        return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "DML EP can only be used with CPU EPs.");
+      }
     }
   }
   return Status::OK();
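
To illustrate the relaxed check above, a minimal sketch (not part of the patch) that mirrors the new condition using IExecutionProvider::Type(), the GetDevice() accessor added in this change, OrtDevice::CPU and kDmlExecutionProvider. The helper name IsAllowedAlongsideDml and the chosen include paths are assumptions for illustration only; the actual validation lives in InferenceSession::HasInvalidCombinationOfExecutionProviders().

// Sketch only: restates the relaxed DML compatibility rule, not shipped code.
// Assumes ORT's internal headers; the helper name is hypothetical.
#include "core/framework/execution_provider.h"  // IExecutionProvider, OrtDevice
#include "core/graph/constants.h"               // kDmlExecutionProvider

namespace onnxruntime {
// An EP may be registered alongside DML if it is DML itself, or if the OrtDevice it was
// registered with is a CPU device (the CPU EP, or CPU-based EPs such as OpenVINO on CPU).
inline bool IsAllowedAlongsideDml(const IExecutionProvider& ep) {
  return ep.Type() == kDmlExecutionProvider || ep.GetDevice().Type() == OrtDevice::CPU;
}
}  // namespace onnxruntime

Under this rule a session combining DML with a CPU-based EP passes validation, while DML combined with another GPU or NPU EP is still rejected with INVALID_ARGUMENT.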