From 1be64f883190f058256948c8d254c61d1a724008 Mon Sep 17 00:00:00 2001
From: Sushanth Rajasankar <44513542+sushraja-msft@users.noreply.github.com>
Date: Fri, 28 Feb 2025 08:02:04 -0800
Subject: [PATCH 01/46] Fix flash attention for GQA (Phi4) (#23850)

### Description
This change fixes GQA for Flash Attention on Nvidia GPUs.

The root cause appears to be the `k_start + capped_sg_id < seq_causal_length` check. This is either because:

a. `seq_causal_length` varies per lane, so the check becomes non-uniform control flow, which interacts badly with `subgroupShuffle`; or

b. the check itself is incorrect and wipes out values of `v` based on the source lane's `seq_causal_length`, when values of `v` actually need to be causal with respect to the lane that is going to multiply them with `qkt`. `qkt` is already causal because earlier values of `qk` for out-of-bounds `k` are set to `min_value`, and `exp()` of values less than -4 is effectively 0.

This fix works by removing that causal check and relying on `qk` being wiped out earlier. The documented causality behavior for GQA is too sparse to determine which of these is the true root cause.

Prior to this change, prompts with a sequence length greater than 16 and less than 32, or of around 1k, would break with Phi 4, while smaller prompts would work. Tested on Intel Alderlake and Nvidia 4070.
---
 onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
index 57ae8a7e5ba74..c1b025b10e067 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -379,7 +379,7 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
   if (sg_size > 8) {
     for (var i:u32 = 0; i < qkv_head_size_vec; i++) {
-      var val = select(vec4(0), v_tile[capped_sg_id][i], k_start + capped_sg_id < seq_causal_length);
+      var val = v_tile[capped_sg_id][i];
       var sum = subgroupShuffle(val, 0) * qk_1[0];
       sum += subgroupShuffle(val, 1) * qk_1[1];
       sum += subgroupShuffle(val, 2) * qk_1[2];

From 1088a1edfecc377958175dfdde8df425c43b9571 Mon Sep 17 00:00:00 2001
From: Scott McKay
Date: Sat, 1 Mar 2025 09:09:13 +1000
Subject: [PATCH 02/46] Model Builder API (#23223)

### Description
Supports creating a model programmatically using the ORT C or C++ API.
Supports augmenting an existing model to add nodes.
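A minimal sketch of the intended usage via the new C++ wrappers (types and signatures taken from the headers in this diff; see onnxruntime/test/shared_lib/test_model_builder_api.cc for the authoritative examples). The opset version, shapes, and names below are illustrative assumptions, not part of the API:

```c++
#include "onnxruntime_cxx_api.h"

// Build a model that computes C = A + B and create a session from it.
// Assumes a full (non-minimal) build; Ort::GetModelEditorApi() throws otherwise.
Ort::Session BuildAddModel(const Ort::Env& env, const Ort::SessionOptions& options) {
  // float tensors of shape ["batch", 4]; the -1 dim pairs with the symbolic name "batch".
  std::vector<int64_t> dims{-1, 4};
  std::vector<std::string> symbolic_dims{"batch", ""};
  Ort::TensorTypeAndShapeInfo tensor_info(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, dims, &symbolic_dims);
  auto type_info = Ort::TypeInfo::CreateTensorInfo(tensor_info.GetConst());

  std::vector<Ort::ValueInfo> inputs;
  inputs.emplace_back("A", type_info.GetConst());
  inputs.emplace_back("B", type_info.GetConst());
  std::vector<Ort::ValueInfo> outputs;
  outputs.emplace_back("C", type_info.GetConst());

  Ort::Graph graph;
  graph.SetInputs(inputs);   // the graph takes ownership of the ValueInfo instances
  graph.SetOutputs(outputs);

  Ort::Node add_node("Add", /*domain*/ "", "add_node", {"A", "B"}, {"C"});
  graph.AddNode(add_node);   // the graph takes ownership of the node

  Ort::Model model({{"", 21}});  // ONNX domain, opset 21 (illustrative)
  model.AddGraph(graph);         // the model takes ownership of the graph

  // Validates the model, runs optimizers, and prepares the session for inferencing.
  return Ort::Session(env, model, options);
}
```

Augmenting an existing model follows the same pattern, but uses `Session::CreateModelEditorSession` plus `ApplyModelToModelEditorSession`/`FinalizeModelEditorSession` instead of the session constructor.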
### Motivation and Context --- cmake/onnxruntime_session.cmake | 1 + cmake/onnxruntime_unittests.cmake | 6 + include/onnxruntime/core/graph/graph.h | 32 +- include/onnxruntime/core/graph/graph_viewer.h | 6 + .../core/session/onnxruntime_c_api.h | 491 +++++++++++- .../core/session/onnxruntime_cxx_api.h | 258 ++++++- .../core/session/onnxruntime_cxx_inline.h | 350 ++++++++- .../core/framework/onnxruntime_typeinfo.cc | 71 +- .../core/framework/onnxruntime_typeinfo.h | 2 +- .../core/framework/session_state_utils.cc | 30 +- .../core/framework/tensor_type_and_shape.cc | 35 +- .../core/framework/tensorprotoutils.cc | 13 +- onnxruntime/core/graph/graph.cc | 295 +++++++- .../core/graph/graph_flatbuffers_utils.cc | 14 +- onnxruntime/core/graph/model.cc | 32 +- onnxruntime/core/graph/model.h | 8 +- .../core/graph/model_editor_api_types.h | 47 ++ .../core/session/abi_session_options.cc | 17 +- onnxruntime/core/session/api_utils.cc | 25 - onnxruntime/core/session/api_utils.h | 9 - onnxruntime/core/session/custom_ops.cc | 2 +- onnxruntime/core/session/inference_session.cc | 57 +- onnxruntime/core/session/inference_session.h | 35 +- onnxruntime/core/session/model_editor_api.h | 65 ++ .../core/session/model_editor_c_api.cc | 358 +++++++++ onnxruntime/core/session/onnxruntime_c_api.cc | 328 ++++---- onnxruntime/core/session/ort_apis.h | 16 + onnxruntime/core/session/utils.cc | 125 ++++ onnxruntime/core/session/utils.h | 28 + onnxruntime/test/framework/type_info_test.cc | 26 +- onnxruntime/test/shared_lib/custom_op_utils.h | 6 - onnxruntime/test/shared_lib/test_inference.cc | 162 ++-- .../test/shared_lib/test_model_builder_api.cc | 701 ++++++++++++++++++ .../test/shared_lib/test_ort_format_models.cc | 14 +- onnxruntime/test/shared_lib/utils.h | 52 ++ winml/adapter/winml_adapter_model.cpp | 18 +- 36 files changed, 3286 insertions(+), 449 deletions(-) create mode 100644 onnxruntime/core/graph/model_editor_api_types.h delete mode 100644 onnxruntime/core/session/api_utils.cc delete mode 100644 onnxruntime/core/session/api_utils.h create mode 100644 onnxruntime/core/session/model_editor_api.h create mode 100644 onnxruntime/core/session/model_editor_c_api.cc create mode 100644 onnxruntime/core/session/utils.cc create mode 100644 onnxruntime/core/session/utils.h create mode 100644 onnxruntime/test/shared_lib/test_model_builder_api.cc diff --git a/cmake/onnxruntime_session.cmake b/cmake/onnxruntime_session.cmake index 3d63285d50e72..2c2c59091fae5 100644 --- a/cmake/onnxruntime_session.cmake +++ b/cmake/onnxruntime_session.cmake @@ -22,6 +22,7 @@ endif() if (onnxruntime_MINIMAL_BUILD) set(onnxruntime_session_src_exclude "${ONNXRUNTIME_ROOT}/core/session/provider_bridge_ort.cc" + "${ONNXRUNTIME_ROOT}/core/session/model_builder_c_api.cc" ) list(REMOVE_ITEM onnxruntime_session_srcs ${onnxruntime_session_src_exclude}) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 0916aeb3dd92c..cb5a28f82de66 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -503,6 +503,7 @@ set (onnxruntime_shared_lib_test_SRC if (NOT onnxruntime_MINIMAL_BUILD) list(APPEND onnxruntime_shared_lib_test_SRC ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_inference.cc) + list(APPEND onnxruntime_shared_lib_test_SRC ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/test_model_builder_api.cc) endif() if(onnxruntime_RUN_ONNX_TESTS) @@ -1359,14 +1360,19 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) LIBS ${onnxruntime_shared_lib_test_LIBS} DEPENDS ${all_dependencies} ) + + 
target_include_directories(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_ROOT}) + if (onnxruntime_USE_CUDA) target_include_directories(onnxruntime_shared_lib_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) target_sources(onnxruntime_shared_lib_test PRIVATE ${ONNXRUNTIME_SHARED_LIB_TEST_SRC_DIR}/cuda_ops.cu) endif() + if (onnxruntime_USE_ROCM) target_include_directories(onnxruntime_shared_lib_test PRIVATE ${onnxruntime_ROCM_HOME}/include) target_compile_definitions(onnxruntime_shared_lib_test PRIVATE __HIP_PLATFORM_AMD__) endif() + if (CMAKE_SYSTEM_NAME STREQUAL "Android") target_sources(onnxruntime_shared_lib_test PRIVATE "${ONNXRUNTIME_ROOT}/core/platform/android/cxa_demangle.cc" diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 7798394b045dc..35b568e3f8e28 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -27,6 +27,7 @@ #include "core/common/span_utils.h" #include "core/common/status.h" #include "core/common/logging/logging.h" +#include "core/framework/ort_value.h" #include "core/framework/prepacked_weights_container.h" #include "core/graph/onnx_protobuf.h" #include "core/graph/basic_types.h" @@ -39,6 +40,9 @@ #include "core/graph/node_arg.h" #include "core/graph/ort_format_load_options.h" +// Type from Model Editor API in ORT C API so can't be in a namespace +struct OrtGraph; + namespace onnxruntime { class Graph; struct IndexedSubGraph; @@ -763,6 +767,10 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi */ bool GetInitializedTensor(const std::string& tensor_name, const ONNX_NAMESPACE::TensorProto*& value) const; + /** Populate `value` if an externally allocated OrtValue exists for an initializer with the given name. + */ + bool GetOrtValueInitializer(const std::string& name, OrtValue& value) const; + /** Gets all the initializer tensors in this Graph. 
*/ const InitializedTensorSet& GetAllInitializedTensors() const noexcept { return name_to_initial_tensor_; } @@ -1430,6 +1438,16 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi const OrtFormatLoadOptions& load_options, const logging::Logger& logger, std::unique_ptr& graph); + static Status LoadFromModelEditorApiModel(const OrtGraph& api_graph, + const Model& owning_model, + const std::unordered_map& domain_to_version, + IOnnxRuntimeOpSchemaCollectionPtr schema_registry, + bool strict_shape_type_inference, + const logging::Logger& logger, + std::unique_ptr& graph); + + Status UpdateUsingModelEditorApiModel(const OrtModel& api_model); + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) const RuntimeOptimizationRecordContainer& RuntimeOptimizations() const { return runtime_optimizations_; @@ -1630,7 +1648,8 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi // Implementation for initializer replacement Status ReplaceInitializedTensorImpl(ONNX_NAMESPACE::TensorProto new_initializer, bool is_external); - std::vector CreateNodeArgs(const google::protobuf::RepeatedPtrField& names, + template // range-initializer returning std::string + std::vector CreateNodeArgs(const StringRange& names, const ArgNameToTypeMap& name_to_type_map); void ToGraphProtoInternal(ONNX_NAMESPACE::GraphProto& graph_proto) const; @@ -1694,6 +1713,8 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi return nodes_[node_index].get(); } + Status LoadFromModelEditorApiModel(const OrtGraph& api_graph, bool updating_existing_graph = false); + const Model& owning_model_; // GraphProto to store name, version, initializer. @@ -1708,6 +1729,12 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi InitializedTensorSet name_to_initial_tensor_; + // Initializers that are external to the Graph. + // e.g. created from existing memory using CreateTensorWithDataAndDeleterAsOrtValue in the ORT API. + // As we need to convert to TensorProto for the optimizers to work and keep the deleter information we store them + // in the Graph instance and retrieve during session state finalization. + std::unordered_map ortvalue_initializers_; + std::unordered_set, std::hash, std::equal_to> sparse_tensor_names_; @@ -1744,6 +1771,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi // in some case, a fused sub-graph will happens multiple times in one model, we use a map // to store reusable-schema in lookup. InlinedHashMap> reusable_fused_schema_map_; + #endif // !defined(ORT_MINIMAL_BUILD) // Graph nodes. @@ -1806,7 +1834,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi std::unordered_map> node_arg_to_consumer_nodes_; #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) - const std::unordered_map domain_to_version_; + std::unordered_map domain_to_version_; // Model IR version. 
Version ir_version_{ONNX_NAMESPACE::Version::IR_VERSION}; diff --git a/include/onnxruntime/core/graph/graph_viewer.h b/include/onnxruntime/core/graph/graph_viewer.h index 9385e2f092e58..6a664d8be9c05 100644 --- a/include/onnxruntime/core/graph/graph_viewer.h +++ b/include/onnxruntime/core/graph/graph_viewer.h @@ -193,6 +193,12 @@ class GraphViewer { IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const { return graph_->GetSchemaRegistry(); } #endif + /** Populate `value` if an externally allocated OrtValue exists for an initializer with the given name. + */ + bool GetOrtValueInitializer(const std::string& name, OrtValue& value) const { + return graph_->GetOrtValueInitializer(name, value); + } + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer); GraphViewer(const Graph& graph, const IndexedSubGraph* filter_info); diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 47e6389492f30..098de14bdfd61 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -305,6 +305,10 @@ ORT_RUNTIME_CLASS(OpAttr); ORT_RUNTIME_CLASS(Logger); ORT_RUNTIME_CLASS(ShapeInferContext); ORT_RUNTIME_CLASS(LoraAdapter); +ORT_RUNTIME_CLASS(ValueInfo); +ORT_RUNTIME_CLASS(Node); +ORT_RUNTIME_CLASS(Graph); +ORT_RUNTIME_CLASS(Model); #ifdef _WIN32 typedef _Return_type_success_(return == 0) OrtStatus* OrtStatusPtr; @@ -665,6 +669,9 @@ typedef struct OrtApi OrtApi; struct OrtTrainingApi; typedef struct OrtTrainingApi OrtTrainingApi; +struct OrtModelEditorApi; +typedef struct OrtModelEditorApi OrtModelEditorApi; + /** \brief The helper interface to get the right version of OrtApi * * Get a pointer to this structure through ::OrtGetApiBase @@ -847,7 +854,8 @@ struct OrtApi { * * \snippet{doc} snippets.dox OrtStatus Return Value */ - ORT_API2_STATUS(CreateSessionFromArray, _In_ const OrtEnv* env, _In_ const void* model_data, size_t model_data_length, + ORT_API2_STATUS(CreateSessionFromArray, _In_ const OrtEnv* env, + _In_ const void* model_data, size_t model_data_length, _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out); /** \brief Run the model in an ::OrtSession @@ -1340,6 +1348,8 @@ struct OrtApi { * Create a tensor with user's buffer. You can fill the buffer either before calling this function or after. * p_data is owned by caller. ReleaseValue won't release p_data. * + * If you wish to transfer ownership of p_data to ORT use CreateTensorWithDataAndDeleterAsOrtValue. + * * \param[in] info Memory description of where the p_data buffer resides (CPU vs GPU etc). * \param[in] p_data Pointer to the data buffer. * \param[in] p_data_len The number of bytes in the data buffer. @@ -1997,7 +2007,8 @@ struct OrtApi { /** \brief Get the value type from an ::OrtMapTypeInfo * * \param[in] map_type_info - * \param[out] type_info + * \param[out] type_info A copy of the OrtTypeInfo for the map value type. + * The user must free this value with ReleaseTypeInfo. * * \snippet{doc} snippets.dox OrtStatus Return Value */ @@ -2012,7 +2023,8 @@ struct OrtApi { * This is used by WinML to support model reflection APIs. * * \param[in] sequence_type_info - * \param[out] type_info + * \param[out] type_info A copy of the OrtTypeInfo for the sequence element type. + * The user must free this value with ReleaseTypeInfo. 
* * \snippet{doc} snippets.dox OrtStatus Return Value */ @@ -2887,7 +2899,8 @@ struct OrtApi { * \snippet{doc} snippets.dox OrtStatus Return Value */ ORT_API2_STATUS(CreateSessionWithPrepackedWeightsContainer, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path, - _In_ const OrtSessionOptions* options, _Inout_ OrtPrepackedWeightsContainer* prepacked_weights_container, + _In_ const OrtSessionOptions* options, + _Inout_ OrtPrepackedWeightsContainer* prepacked_weights_container, _Outptr_ OrtSession** out); /** \brief Create session from memory with prepacked weights container @@ -2910,7 +2923,8 @@ struct OrtApi { */ ORT_API2_STATUS(CreateSessionFromArrayWithPrepackedWeightsContainer, _In_ const OrtEnv* env, _In_ const void* model_data, size_t model_data_length, - _In_ const OrtSessionOptions* options, _Inout_ OrtPrepackedWeightsContainer* prepacked_weights_container, + _In_ const OrtSessionOptions* options, + _Inout_ OrtPrepackedWeightsContainer* prepacked_weights_container, _Outptr_ OrtSession** out); /// @} @@ -4293,8 +4307,8 @@ struct OrtApi { * specific type that is described by the returned ::OrtTypeInfo. * * \param[in] optional_type_info - * \param[out] out A pointer to the ::OrtTypeInfo for what the optional value could be. - * it is owned by OrtOptionalTypeInfo instance. + * \param[out] out A copy of ::OrtTypeInfo for what the optional value could be. + * The user must free this value with ReleaseTypeInfo. * * \snippet{doc} snippets.dox OrtStatus Return Value * @@ -4786,6 +4800,75 @@ struct OrtApi { */ ORT_API2_STATUS(SetEpDynamicOptions, _Inout_ OrtSession* sess, _In_reads_(kv_len) const char* const* keys, _In_reads_(kv_len) const char* const* values, _In_ size_t kv_len); + + /** \brief Release an OrtValueInfo instance if it was not added to an OrtGraph. + * \since Version 1.21. + */ + ORT_CLASS_RELEASE(ValueInfo); + + /** \brief Release an OrtNode if it was not added to an OrtGraph. + * \since Version 1.21. + */ + ORT_CLASS_RELEASE(Node); + + /** \brief Release an OrtGraph. + * \snippet{doc} snippets.dox OrtStatus Return Value + * \since Version 1.21. + */ + ORT_CLASS_RELEASE(Graph); + + /** \brief Release an OrtModel. + * \snippet{doc} snippets.dox OrtStatus Return Value + * \since Version 1.21. + */ + ORT_CLASS_RELEASE(Model); + + /** \brief Get the value name from an OrtValueInfo instance. + * \param[in] value_info The OrtValueInfo instance. + * \snippet{doc} snippets.dox OrtStatus Return Value + * \since Version 1.21. + */ + ORT_API2_STATUS(GetValueInfoName, _In_ const OrtValueInfo* value_info, _Out_ const char** name); + + /** \brief Get the type information from an OrtValueInfo instance. + * \param[in] value_info The OrtValueInfo instance. + * \snippet{doc} snippets.dox OrtStatus Return Value + * \since Version 1.21. + */ + ORT_API2_STATUS(GetValueInfoTypeInfo, _In_ const OrtValueInfo* value_info, _Outptr_ const OrtTypeInfo** type_info); + + /** \brief Get the Model Editor API instance + * + * Get the Model Editor API instance to create a new model or augment an existing model. + * + * \return Model Editor API struct + * + * \since Version 1.21. + */ + const OrtModelEditorApi*(ORT_API_CALL* GetModelEditorApi)(); + + /** \brief Create an OrtValue for a Tensor that uses pre-existing memory. + * + * ORT will take ownership of the memory and free it using the provided deleter when no longer in use. + * + * \param[in] deleter OrtAllocator instance that will be used to free the memory. + * Only the OrtAllocator:Info and OrtAllocator::Release functions are required. 
+ * The OrtMemoryInfo returned by OrtAllocator::Info must match the location of p_data.
+ * \param[in] p_data Pointer to the memory that will be used by the Tensor. ORT will take ownership of the memory.
+ * \param[in] p_data_len Length of the memory in bytes.
+ * \param[in] shape Dimensions of the Tensor. All values should be > 0.
+ * \param[in] shape_len Number of dimensions in the shape array.
+ * \param[in] type Data type of the Tensor.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateTensorWithDataAndDeleterAsOrtValue, _In_ OrtAllocator* deleter,
+ _In_ void* p_data, size_t p_data_len,
+ _In_ const int64_t* shape, size_t shape_len,
+ ONNXTensorElementDataType type,
+ _Outptr_ OrtValue** out);
 };

 /*
@@ -4900,6 +4983,400 @@ struct OrtCustomOp {
 void(ORT_API_CALL* ReleaseAliasMap)(_Frees_ptr_opt_ int* input_index, _Frees_ptr_opt_ int* output_index);
 };

+/**
+ * ORT Model Editor API
+ */
+
+/**
+ * \brief The OrtModelEditorApi struct provides functions to create or edit an ONNX model.
+ *
+ * See onnxruntime/test/shared_lib/test_model_editor_api.cc for example usage.
+ *
+ * \since Version 1.21.
+ */
+struct OrtModelEditorApi {
+ // Model building/editing requires a full build. We return nullptr from GetModelEditorApi if this is a minimal
+ // build, so it doesn't matter if there are no function pointers in this struct as a user will never get an
+ // OrtModelEditorApi instance. We do however need a dummy field to avoid an empty struct warning.
+#if defined(ORT_MINIMAL_BUILD)
+ const bool not_defined_in_this_build;
+#else
+ /** \brief Create an OrtTypeInfo instance for a Tensor.
+ *
+ * Create an OrtTypeInfo instance for a Tensor to use as graph inputs/outputs with the Model Editor API.
+ *
+ * User can release `tensor_info` after creating the OrtTypeInfo.
+ *
+ * \param[in] tensor_info Tensor type and shape information.
+ * \param[out] type_info TypeInfo instance for the tensor.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateTensorTypeInfo, _In_ const OrtTensorTypeAndShapeInfo* tensor_info,
+ _Outptr_ OrtTypeInfo** type_info);
+
+ /** \brief Create an OrtTypeInfo instance for a SparseTensor.
+ *
+ * Create an OrtTypeInfo instance for a SparseTensor to use as graph inputs/outputs with the Model Editor API.
+ *
+ * User can release `tensor_info` after creating the OrtTypeInfo.
+ *
+ * \param[in] tensor_info SparseTensor type and shape information.
+ * \param[out] type_info TypeInfo instance for the tensor.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateSparseTensorTypeInfo, _In_ const OrtTensorTypeAndShapeInfo* tensor_info,
+ _Outptr_ OrtTypeInfo** type_info);
+
+ /** \brief Create an OrtTypeInfo instance for a Map.
+ *
+ * Create an OrtTypeInfo instance for a Map to use as graph inputs/outputs with the Model Editor API.
+ *
+ * User can release `map_value_type` after creating the OrtTypeInfo.
+ *
+ * \param[in] map_key_type Key type for the map.
+ * \param[in] map_value_type Value type for the map.
+ * \param[out] type_info TypeInfo instance for the map.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateMapTypeInfo, ONNXTensorElementDataType map_key_type, _In_ const OrtTypeInfo* map_value_type,
+ _Outptr_ OrtTypeInfo** type_info);
+
+ /** \brief Create an OrtTypeInfo instance for a Sequence.
+ *
+ * Create an OrtTypeInfo instance for a Sequence to use as graph inputs/outputs with the Model Editor API.
+ *
+ * User can release `sequence_type` after creating the OrtTypeInfo.
+ *
+ * \param[in] sequence_type Sequence type and shape information.
+ * \param[out] type_info TypeInfo instance for the sequence.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateSequenceTypeInfo, _In_ const OrtTypeInfo* sequence_type, _Outptr_ OrtTypeInfo** type_info);
+
+ /** \brief Create an OrtTypeInfo instance for an Optional.
+ *
+ * Create an OrtTypeInfo instance for an Optional to use as graph inputs/outputs with the Model Editor API.
+ *
+ * User can release `contained_type` after creating the OrtTypeInfo.
+ *
+ * \param[in] contained_type Type information for the value contained by the optional.
+ * \param[out] type_info TypeInfo instance for the optional.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateOptionalTypeInfo, _In_ const OrtTypeInfo* contained_type, _Outptr_ OrtTypeInfo** type_info);
+
+ /** \brief Create an OrtValueInfo for use as an OrtGraph input or output.
+ *
+ * \param[in] name The name of the input or output.
+ * \param[in] type_info The type information for the input or output. The provided value is copied.
+ * \param[out] value_info The OrtValueInfo instance.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateValueInfo, _In_ const char* name, _In_ const OrtTypeInfo* type_info,
+ _Outptr_ OrtValueInfo** value_info);
+
+ /** \brief Create an OrtNode to add to an OrtGraph.
+ *
+ * Create an OrtNode.
+ *
+ * Create attributes with CreateOpAttr. OrtOpAttr instances are copied.
+ *
+ * \param[in] operator_name The name of the operator.
+ * \param[in] domain_name The domain of the operator. Use an empty string for ONNX operators.
+ * \param[in] node_name The name of the node.
+ * \param[in] input_names The names of the inputs.
+ * \param[in] input_names_len The number of input names.
+ * \param[in] output_names The names of the outputs.
+ * \param[in] output_names_len The number of output names.
+ * \param[in] attributes The optional attributes of the node.
+ * \param[in] attribs_len The number of attributes. May be zero.
+ * \param[out] node The OrtNode instance.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateNode, _In_ const char* operator_name, _In_ const char* domain_name, _In_ const char* node_name,
+ _In_reads_(input_names_len) const char* const* input_names, size_t input_names_len,
+ _In_reads_(output_names_len) const char* const* output_names, size_t output_names_len,
+ _In_reads_(attribs_len) _In_opt_ OrtOpAttr** attributes, _In_ size_t attribs_len,
+ _Outptr_ OrtNode** node);
+
+ /** \brief Create an OrtGraph
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateGraph, _Outptr_ OrtGraph** graph);
+
+ /** \brief Set the inputs for the OrtGraph.
+ *
+ * Set the graph inputs. This will replace any existing inputs with the new values.
+ * The OrtGraph takes ownership of the OrtValueInfo instances and you should NOT call ReleaseOrtValueInfo.
+ *
+ * \param[in] graph The OrtGraph instance to update.
+ * \param[in] inputs The input OrtValueInfo instances.
+ * \param[in] inputs_len The number of input OrtValueInfo instances.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */ + ORT_API2_STATUS(SetGraphInputs, _Inout_ OrtGraph* graph, + _In_reads_(inputs_len) _In_ OrtValueInfo** inputs, _In_ size_t inputs_len); + + /** \brief Set the outputs for the OrtGraph. + * + * Set the graph outputs. This will replace any existing outputs with the new values. + * The OrtGraph takes ownership of the OrtValueInfo instances provided and you should NOT call ReleaseOrtValueInfo. + * + * \param[in] graph The OrtGraph instance to update. + * \param[in] outputs The output OrtValueInfo instances. + * \param[in] outputs_len The number of output OrtValueInfo instances. + * + * \snippet{doc} snippets.dox OrtStatus Return Value + * + * \since Version 1.21. + */ + ORT_API2_STATUS(SetGraphOutputs, _Inout_ OrtGraph* graph, + _In_reads_(outputs_len) _In_ OrtValueInfo** outputs, _In_ size_t outputs_len); + + /** \brief Add an initializer to the OrtGraph + * + * ORT will take ownership of the OrtValue and you should NOT call ReleaseOrtValue. + * + * Two options: + * + * Allocated memory: + * Use CreateTensorAsOrtValue (allocates memory) and populate the tensor with the data. + * Set `data_is_external` to false. + * + * Pre-existing memory: + * Use CreateTensorWithDataAsOrtValue or CreateTensorWithDataAndDeleterAsOrtValue to create an OrtValue + * with a tensor that contains a pointer to the existing data. + * Set `data_is_external` to true. + * + * The pointer must remain valid for the duration of the inference session. + * If using CreateTensorWithDataAsOrtValue you are responsible for freeing the memory after the inference session + * is released. + * If using CreateTensorWithDataAndDeleterAsOrtValue, ORT will free the memory using the provided deleter as + * soon as the OrtValue is no longer in use. + * + * NOTE: A tensor containing pre-existing memory MUST have 128 bytes of data or more. + * For smaller tensors use CreateTensorAsOrtValue. + * + * ONNX shape inferencing does not support external data. An initializer involved in shape inferencing is + * typically small (a single value or limited by the rank of a tensor) and uses less than 128 bytes of + * memory, so this limit acts as a simple catch-all rule to avoid issues. + * e.g. Reshape's `shape`, Clip's `min` and `max`, various ops `axes`. + * + * \param[in] graph The OrtGraph instance to update. + * \param[in] name The value name for the initializer. + * \param[in] tensor The OrtValue instance containing the tensor data. + * \param[in] data_is_external Set to true if the data is external and should not be copied. + * + * \snippet{doc} snippets.dox OrtStatus Return Value + * + * \since Version 1.21. + */ + ORT_API2_STATUS(AddInitializerToGraph, _Inout_ OrtGraph* graph, _In_ const char* name, _In_ OrtValue* tensor, + bool data_is_external); + + /** \brief Add an OrtNode to an OrtGraph + * + * Add the node to the graph. The OrtGraph will take ownership of OrtNode and you should NOT call ReleaseOrtNode. + * + * \param[in] graph The OrtGraph instance to update. + * \param[in] node The OrtNode instance to add to the graph. + * + * \snippet{doc} snippets.dox OrtStatus Return Value + * + * \since Version 1.21. + */ + ORT_API2_STATUS(AddNodeToGraph, _Inout_ OrtGraph* graph, _In_ OrtNode* node); + + /** \brief Create an OrtModel. + * + * Create an OrtModel. + * + * This can be used to build a new model, or to augment an existing model. + * + * \param[in] domain_names The domain names for the model. + * If augmenting an existing model add additional domains if needed. + * \param[in] opset_versions The opset versions for the model. 
+ * If augmenting an existing model add additional opset versions if needed.
+ * \param[in] opset_entries_len The number of domain_names and opset_versions entries.
+ * Domain and opset entries should be 1:1.
+ * \param[out] model The OrtModel instance.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateModel,
+ _In_reads_(opset_entries_len) const char* const* domain_names,
+ _In_reads_(opset_entries_len) const int* opset_versions,
+ size_t opset_entries_len,
+ _Outptr_ OrtModel** model);
+
+ /** \brief Add an OrtGraph to an OrtModel.
+ *
+ * Add the graph to a model. This should be called once when creating a new model.
+ *
+ * The OrtModel takes ownership of the OrtGraph and you should NOT call ReleaseOrtGraph.
+ *
+ * \param[in] model The OrtModel instance to update.
+ * \param[in] graph The OrtGraph instance to add to the model.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(AddGraphToModel, _Inout_ OrtModel* model, _In_ OrtGraph* graph);
+
+ /** \brief Create an OrtSession using the OrtModel.
+ *
+ * Create an inference session using the OrtModel instance.
+ * The OrtModel should have been populated with an OrtGraph containing nodes and initializers, and SetGraphInputs
+ * and SetGraphOutputs must have been called.
+ * This will validate the model, run optimizers, and prepare the session for inferencing.
+ *
+ * ReleaseOrtModel must be called to free the OrtModel after session creation.
+ *
+ * \param[in] env The OrtEnv instance.
+ * \param[in] model The OrtModel instance.
+ * \param[in] options The OrtSessionOptions instance.
+ * \param[out] out The OrtSession instance.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateSessionFromModel, _In_ const OrtEnv* env, _In_ const OrtModel* model,
+ _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out);
+
+ /** \brief Create an OrtSession to augment an existing model.
+ *
+ * Create an OrtSession with an existing model that will be augmented with additional nodes and initializers.
+ * Nodes can be added before or after the existing nodes in the model. ONNX Runtime will connect the nodes when the
+ * model is finalized.
+ *
+ * To add nodes and initializers to the existing model, first create an OrtModel using CreateModel.
+ * Add nodes and initializers to the OrtModel using AddNodeToGraph and AddInitializerToGraph.
+ * Graph inputs/outputs should be updated with SetGraphInputs and SetGraphOutputs as needed to reflect changes made
+ * by the new nodes. The list of graph inputs/outputs should be for the overall model and not just the new nodes.
+ *
+ * Add the new information from the OrtModel to the original model using ApplyModelToModelEditorSession, and
+ * prepare the session for inferencing by calling FinalizeModelEditorSession.
+ *
+ * \param[in] env The OrtEnv instance.
+ * \param[in] model_path The path to the existing ONNX model to augment.
+ * \param[in] options The OrtSessionOptions instance.
+ * \param[out] out The created OrtSession instance.
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateModelEditorSession, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path,
+ _In_ const OrtSessionOptions* options,
+ _Outptr_ OrtSession** out);
+
+ /** \brief Create an OrtSession to augment an existing model.
+ *
+ * Create an OrtSession with an existing model that will be augmented with additional nodes and initializers.
+ * Nodes can be added before or after the existing nodes in the model. ONNX Runtime will connect the nodes when the
+ * model is finalized.
+ *
+ * To add nodes and initializers to the existing model, first create an OrtModel using CreateModel.
+ * Add nodes and initializers to the OrtModel using AddNodeToGraph and AddInitializerToGraph.
+ * Graph inputs/outputs should be updated with SetGraphInputs and SetGraphOutputs as needed to reflect changes made
+ * by the new nodes. The list of graph inputs/outputs should be for the overall model and not just the new nodes.
+ *
+ * Add the new information from the OrtModel to the original model using ApplyModelToModelEditorSession, and
+ * prepare the session for inferencing by calling FinalizeModelEditorSession.
+ *
+ * \param[in] env The OrtEnv instance.
+ * \param[in] model_data The model data for the existing model to augment.
+ * \param[in] model_data_length The length of the model data.
+ * \param[in] options The OrtSessionOptions instance.
+ * \param[out] out The created OrtSession instance.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(CreateModelEditorSessionFromArray, _In_ const OrtEnv* env,
+ _In_ const void* model_data, size_t model_data_length,
+ _In_ const OrtSessionOptions* options,
+ _Outptr_ OrtSession** out);
+
+ /** \brief Query the session for the opset version of a domain.
+ *
+ * When using the Model Editor API to augment a model, any new nodes must conform to the opset version of the
+ * original model. To do that the user must be able to discover that opset version.
+ *
+ * \param[in] session OrtSession to query.
+ * \param[in] domain Domain to query. The ONNX domain is an empty string.
+ * \param[out] opset The opset version of the domain.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value. Returns an error if the domain is not used in the model.
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(SessionGetOpsetForDomain, _In_ const OrtSession* session, _In_ const char* domain, _Out_ int* opset);
+
+ /** \brief Apply changes to augment the ONNX model in a session created using CreateModelEditorSession[FromArray].
+ *
+ * Adds new nodes and updates graph inputs/outputs using `model` to augment the original ONNX model in the session.
+ * All changes will be validated.
+ * Call FinalizeModelEditorSession to prepare the session for inferencing.
+ *
+ * Existing inputs/outputs will only be updated if the OrtGraph inputs/outputs are set in the OrtModel.
+ * i.e. you don't need to call SetGraphInputs/SetGraphOutputs if they are unchanged.
+ *
+ * ReleaseOrtModel must be called to free the OrtModel after it is applied to the session.
+ *
+ * \param[in] session OrtSession to update. Session must have been created using CreateModelEditorSession[FromArray].
+ * \param[in] model OrtModel containing new nodes, new initializers, and updated graph input and/or output info.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(ApplyModelToModelEditorSession, _Inout_ OrtSession* session, _In_ OrtModel* model);
+
+ /** \brief Finalize the Model Editor session that was created using CreateModelEditorSession[FromArray].
+ *
+ * Finalize the Model Editor session that augmented an ONNX model by adding new nodes.
+ * This will run optimizers and prepare the session for inferencing.
+ *
+ * \param[in] session OrtSession to finalize. Session must have been created using CreateModelEditorSession[FromArray].
+ * \param[in] options OrtSessionOptions to use for the session.
+ * \param[in] prepacked_weights_container Optional OrtPrepackedWeightsContainer to use for the session.
+ *            Set to nullptr if not used.
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.21.
+ */
+ ORT_API2_STATUS(FinalizeModelEditorSession, _Inout_ OrtSession* session, _In_ const OrtSessionOptions* options,
+ _In_opt_ OrtPrepackedWeightsContainer* prepacked_weights_container);
+#endif // !defined(ORT_MINIMAL_BUILD)
+};
+
 /*
 * This is the old way to add the CUDA provider to the session, please use SessionOptionsAppendExecutionProvider_CUDA above to access the latest functionality
 * This function always exists, but will only succeed if Onnxruntime was built with CUDA support and the CUDA provider shared library exists
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 123ef98901003..89488b5158c93 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -26,16 +26,17 @@
 #include "onnxruntime_c_api.h"
 #include "onnxruntime_float16.h"
+#include
 #include
 #include
-#include
 #include
 #include
 #include
-#include
+#include
 #include
 #include
-#include
+#include
+#include
 #ifdef ORT_NO_EXCEPTIONS
 #include
@@ -120,7 +121,7 @@ const OrtApi* Global::api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION);
 #endif
 #endif
-/// This returns a reference to the ORT C API.
+/// This returns a reference to the ORT C API.
 inline const OrtApi& GetApi() noexcept { return *Global::api_; }
 ///
@@ -143,6 +144,20 @@ std::string GetBuildInfoString();
 /// vector of strings
 std::vector GetAvailableProviders();
+///
+/// This returns a reference to the ORT C Model Editor API. Used if building or augmenting a model at runtime.
+///
+/// ORT C Model Editor API reference
+inline const OrtModelEditorApi& GetModelEditorApi() {
+  auto* api = GetApi().GetModelEditorApi();
+  if (api == nullptr) {
+    // minimal build
+    ORT_CXX_API_THROW("Model Editor API is not available in this build", ORT_FAIL);
+  }
+
+  return *api;
+}
+
 /** \brief IEEE 754 half-precision floating point data type
 *
 * \details This struct is used for converting float to float16 and back
@@ -523,6 +538,10 @@ ORT_DEFINE_RELEASE(Status);
 ORT_DEFINE_RELEASE(OpAttr);
 ORT_DEFINE_RELEASE(Op);
 ORT_DEFINE_RELEASE(KernelInfo);
+ORT_DEFINE_RELEASE(ValueInfo);
+ORT_DEFINE_RELEASE(Node);
+ORT_DEFINE_RELEASE(Graph);
+ORT_DEFINE_RELEASE(Model);
 #undef ORT_DEFINE_RELEASE
@@ -559,7 +578,9 @@ struct Base {
 constexpr Base() = default;
 constexpr explicit Base(contained_type* p) noexcept : p_{p} {}
-  ~Base() { OrtRelease(p_); }
+  ~Base() {
+    OrtRelease(p_);
+  }
 Base(const Base&) = delete;
 Base& operator=(const Base&) = delete;
@@ -635,9 +656,13 @@ struct AllocatedFree {
 struct AllocatorWithDefaultOptions;
 struct Env;
+struct Graph;
+struct Model;
+struct Node;
+struct ModelMetadata;
 struct TypeInfo;
 struct Value;
-struct ModelMetadata;
+struct ValueInfo;
 /** \brief unique_ptr typedef used to own strings allocated by OrtAllocators
 * and release them at the end of the scope.
The lifespan of the given allocator @@ -1051,6 +1076,10 @@ struct ConstSessionImpl : Base { size_t GetOutputCount() const; ///< Returns the number of model outputs size_t GetOverridableInitializerCount() const; ///< Returns the number of inputs that have defaults that can be overridden + std::vector GetInputNames() const; + std::vector GetOutputNames() const; + std::vector GetOverridableInitializerNames() const; + /** \brief Returns a copy of input name at the specified index. * * \param index must less than the value returned by GetInputCount() @@ -1084,6 +1113,12 @@ struct ConstSessionImpl : Base { TypeInfo GetInputTypeInfo(size_t index) const; ///< Wraps OrtApi::SessionGetInputTypeInfo TypeInfo GetOutputTypeInfo(size_t index) const; ///< Wraps OrtApi::SessionGetOutputTypeInfo TypeInfo GetOverridableInitializerTypeInfo(size_t index) const; ///< Wraps OrtApi::SessionGetOverridableInitializerTypeInfo + + int GetOpset(const std::string& domain) const; ///< Wraps OrtApi::SessionGetOpsetForDomain + + // Will move before checkin if that's the case. + std::vector GetInputs() const; + std::vector GetOutputs() const; }; template @@ -1161,6 +1196,9 @@ struct SessionImpl : ConstSessionImpl { * \param[in] kv_len Number of elements in the keys and values arrays */ void SetEpDynamicOptions(const char* const* keys, const char* const* values, size_t kv_len); + + void FinalizeModelEditorSession(const Model& model, const SessionOptions& options, + OrtPrepackedWeightsContainer* prepacked_weights_container = nullptr); }; } // namespace detail @@ -1172,13 +1210,34 @@ using UnownedSession = detail::SessionImpl>; * */ struct Session : detail::SessionImpl { - explicit Session(std::nullptr_t) {} ///< Create an empty Session object, must be assigned a valid one to be used - Session(const Env& env, const ORTCHAR_T* model_path, const SessionOptions& options); ///< Wraps OrtApi::CreateSession + /// Create an empty Session object, must be assigned a valid one to be used. 
Wraps OrtApi::CreateSession + explicit Session(std::nullptr_t) {} + explicit Session(OrtSession* p) : SessionImpl{p} {} ///< C API Interop + + Session(const Env& env, const ORTCHAR_T* model_path, const SessionOptions& options); + + /// Wraps OrtApi::CreateSessionWithPrepackedWeightsContainer Session(const Env& env, const ORTCHAR_T* model_path, const SessionOptions& options, - OrtPrepackedWeightsContainer* prepacked_weights_container); ///< Wraps OrtApi::CreateSessionWithPrepackedWeightsContainer - Session(const Env& env, const void* model_data, size_t model_data_length, const SessionOptions& options); ///< Wraps OrtApi::CreateSessionFromArray + OrtPrepackedWeightsContainer* prepacked_weights_container); + + /// Wraps OrtApi::CreateSessionFromArray + Session(const Env& env, const void* model_data, size_t model_data_length, const SessionOptions& options); + + /// Wraps OrtApi::CreateSessionFromArrayWithPrepackedWeightsContainer Session(const Env& env, const void* model_data, size_t model_data_length, const SessionOptions& options, - OrtPrepackedWeightsContainer* prepacked_weights_container); ///< Wraps OrtApi::CreateSessionFromArrayWithPrepackedWeightsContainer + OrtPrepackedWeightsContainer* prepacked_weights_container); + +#if !defined(ORT_MINIMAL_BUILD) + /// Wraps OrtModelEditorApi::CreateSessionFromModel + Session(const Env& env, const Model& model, const SessionOptions& options); + + /// Wraps OrtModelEditorApi::CreateModelEditorSession + static Session CreateModelEditorSession(const Env& env, const ORTCHAR_T* model_path, const SessionOptions& options); + + /// Wraps OrtModelEditorApi::CreateModelEditorSession + static Session CreateModelEditorSession(const Env& env, const void* model_data, size_t model_data_length, + const SessionOptions& options); +#endif // !defined(ORT_MINIMAL_BUILD) ConstSession GetConst() const { return ConstSession{this->p_}; } UnownedSession GetUnowned() const { return UnownedSession{this->p_}; } @@ -1210,7 +1269,7 @@ using ConstMemoryInfo = detail::MemoryInfoImpl { static MemoryInfo CreateCpu(OrtAllocatorType type, OrtMemType mem_type1); explicit MemoryInfo(std::nullptr_t) {} ///< No instance is created - explicit MemoryInfo(OrtMemoryInfo* p) : MemoryInfoImpl{p} {} ///< Take ownership of a pointer created by C Api + explicit MemoryInfo(OrtMemoryInfo* p) : MemoryInfoImpl{p} {} ///< Take ownership of a pointer created by C API MemoryInfo(const char* name, OrtAllocatorType type, int id, OrtMemType mem_type); ConstMemoryInfo GetConst() const { return ConstMemoryInfo{this->p_}; } }; @@ -1233,6 +1292,7 @@ struct TensorTypeAndShapeInfoImpl : Base { [[deprecated("use GetShape()")]] void GetDimensions(int64_t* values, size_t values_count) const; ///< Wraps OrtApi::GetDimensions void GetSymbolicDimensions(const char** values, size_t values_count) const; ///< Wraps OrtApi::GetSymbolicDimensions + std::vector GetSymbolicDimensions() const; std::vector GetShape() const; ///< Uses GetDimensionsCount & GetDimensions to return a std::vector of the shape }; @@ -1248,8 +1308,18 @@ struct TensorTypeAndShapeInfo : detail::TensorTypeAndShapeInfoImpl; using Base::Base; - explicit TensorTypeAndShapeInfo(std::nullptr_t) {} ///< Create an empty TensorTypeAndShapeInfo object, must be assigned a valid one to be used - explicit TensorTypeAndShapeInfo(OrtTensorTypeAndShapeInfo* p) : TensorTypeAndShapeInfoImpl{p} {} ///< Used for interop with the C API + /// Create an empty TensorTypeAndShapeInfo object, must be assigned a valid one to be used + explicit 
TensorTypeAndShapeInfo(std::nullptr_t) {} + /// Used for interop with the C API + explicit TensorTypeAndShapeInfo(OrtTensorTypeAndShapeInfo* p) : TensorTypeAndShapeInfoImpl{p} {} + + // Create a TensorTypeAndShapeInfo object with the specified element type and dimensions + // symbolic_dims are optional, but should be 1:1 with dims. + // The value in symbolic_dims will be used for all entries in dims that are -1. + explicit TensorTypeAndShapeInfo(ONNXTensorElementDataType element_type, + const std::vector& dims, + const std::vector* symbolic_dims = nullptr); + ConstTensorTypeAndShapeInfo GetConst() const { return ConstTensorTypeAndShapeInfo{this->p_}; } }; @@ -1344,9 +1414,18 @@ struct TypeInfo : detail::TypeInfoImpl { using Base = detail::TypeInfoImpl; using Base::Base; - explicit TypeInfo(std::nullptr_t) {} ///< Create an empty TypeInfo object, must be assigned a valid one to be used + /// Create an empty TypeInfo object, must be assigned a valid one to be used + explicit TypeInfo(std::nullptr_t) {} explicit TypeInfo(OrtTypeInfo* p) : TypeInfoImpl{p} {} ///< C API Interop +#if !defined(ORT_MINIMAL_BUILD) + static TypeInfo CreateTensorInfo(ConstTensorTypeAndShapeInfo tensor_info); + static TypeInfo CreateSparseTensorInfo(ConstTensorTypeAndShapeInfo sparse_tensor_info); + static TypeInfo CreateSequenceTypeInfo(ConstTypeInfo sequence_type); + static TypeInfo CreateMapTypeInfo(ONNXTensorElementDataType key_type, ConstTypeInfo value_type); + static TypeInfo CreateOptionalTypeInfo(ConstTypeInfo contained_type); +#endif // !defined(ORT_MINIMAL_BUILD) + ConstTypeInfo GetConst() const { return ConstTypeInfo{this->p_}; } }; @@ -1701,7 +1780,8 @@ struct Value : detail::ValueImpl { * \param shape_len The number of tensor shape dimensions. */ template - static Value CreateTensor(const OrtMemoryInfo* info, T* p_data, size_t p_data_element_count, const int64_t* shape, size_t shape_len); + static Value CreateTensor(const OrtMemoryInfo* info, T* p_data, size_t p_data_element_count, + const int64_t* shape, size_t shape_len); /** \brief Creates a tensor with a user supplied buffer. Wraps OrtApi::CreateTensorWithDataAsOrtValue. * @@ -1712,11 +1792,25 @@ struct Value : detail::ValueImpl { * \param shape_len The number of tensor shape dimensions. * \param type The data type. */ - static Value CreateTensor(const OrtMemoryInfo* info, void* p_data, size_t p_data_byte_count, const int64_t* shape, size_t shape_len, + static Value CreateTensor(const OrtMemoryInfo* info, void* p_data, size_t p_data_byte_count, + const int64_t* shape, size_t shape_len, + ONNXTensorElementDataType type); + + /** \brief Creates a tensor with a user supplied buffer. Wraps OrtApi::CreateTensorWithDataAndDeleterAsOrtValue. + * + * \param deleter OrtAllocator that will be used to free the buffer when no longer required. + * \param p_data Pointer to the data buffer. + * \param p_data_byte_count The number of bytes in the data buffer. + * \param shape Pointer to the tensor shape dimensions. + * \param shape_len The number of tensor shape dimensions. + * \param type The data type. + */ + static Value CreateTensor(OrtAllocator* deleter, void* p_data, size_t p_data_byte_count, + const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type); /** \brief Creates an OrtValue with a tensor using a supplied OrtAllocator. Wraps OrtApi::CreateTensorAsOrtValue. - * This overload will allocate the buffer for the tensor according to the supplied shape and data type. 
+ * This overload will allocate the buffer for the tensor according to the supplied shape and data type. * The allocated buffer will be owned by the returned OrtValue and will be freed when the OrtValue is released. * The input data would need to be copied into the allocated buffer. * This API is not suitable for strings. @@ -1740,7 +1834,8 @@ struct Value : detail::ValueImpl { * \param shape_len The number of tensor shape dimensions. * \param type The data type. */ - static Value CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type); + static Value CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len, + ONNXTensorElementDataType type); /** \brief Creates an OrtValue with a Map Onnx type representation. * The API would ref-count the supplied OrtValues and they will be released @@ -2459,6 +2554,129 @@ struct CustomOpBase : OrtCustomOp { int end_ver_ = MAX_CUSTOM_OP_END_VER; }; -} // namespace Ort +namespace detail { +template +struct ValueInfoImpl : Ort::detail::Base { + using B = Ort::detail::Base; + using B::B; + + std::string Name() const; + ConstTypeInfo TypeInfo() const; +}; +} // namespace detail + +// Const object holder that does not own the underlying object +using ConstValueInfo = detail::ValueInfoImpl>; +/** \brief Wrapper around ::OrtValueInfo + * + */ +struct ValueInfo : detail::ValueInfoImpl { + explicit ValueInfo(std::nullptr_t) {} ///< No instance is created + /// Take ownership of a pointer created by C API + explicit ValueInfo(OrtValueInfo* p) : ValueInfoImpl{p} {} + + // Create ValueInfo for a tensor + explicit ValueInfo(const std::string& name, const ConstTypeInfo& type_info); + + ConstValueInfo GetConst() const { return ConstValueInfo{this->p_}; } +}; + +namespace detail { +template +struct NodeImpl : Ort::detail::Base { + using B = Ort::detail::Base; + using B::B; +}; +} // namespace detail + +/** \brief Wrapper around ::OrtNode + * + */ +struct Node : detail::NodeImpl { + explicit Node(std::nullptr_t) {} ///< No instance is created + explicit Node(OrtNode* p) : NodeImpl{p} {} ///< Take ownership of a pointer created by C API + +#if !defined(ORT_MINIMAL_BUILD) + Node(const std::string& operator_name, const std::string& operator_domain, + const std::string& node_name, + const std::vector& input_names, + const std::vector& output_names); + + /// + /// Wraps CreateNode. Node takes ownership of attributes on success and updates the OpAttr in `attributes` to do so. 
+ /// + Node(const std::string& operator_name, const std::string& operator_domain, + const std::string& node_name, + const std::vector& input_names, + const std::vector& output_names, + std::vector& attributes); + + private: + static void Init(const std::string& operator_name, const std::string& operator_domain, + const std::string& node_name, + const std::vector& input_names, + const std::vector& output_names, + std::vector& attributes, + OrtNode*& node); +#endif // !defined(ORT_MINIMAL_BUILD) +}; + +namespace detail { +template +struct GraphImpl : Ort::detail::Base { + using B = Ort::detail::Base; + using B::B; + +#if !defined(ORT_MINIMAL_BUILD) + void SetInputs(std::vector& inputs); + void SetOutputs(std::vector& outputs); + void AddInitializer(const std::string& name, Value& initializer, bool data_is_external); // Graph takes ownership of Value + void AddNode(Node& node); // Graph takes ownership of Node +#endif // !defined(ORT_MINIMAL_BUILD) +}; +} // namespace detail + +/** \brief Wrapper around ::OrtGraph + * + */ +struct Graph : detail::GraphImpl { + explicit Graph(std::nullptr_t) {} ///< No instance is created + explicit Graph(OrtGraph* p) : GraphImpl{p} {} ///< Take ownership of a pointer created by C API +#if !defined(ORT_MINIMAL_BUILD) + Graph(); +#endif +}; + +namespace detail { +template +struct ModelImpl : Ort::detail::Base { + using B = Ort::detail::Base; + using B::B; + +#if !defined(ORT_MINIMAL_BUILD) + void AddGraph(Graph& graph); +#endif +}; +} // namespace detail + +// Const object holder that does not own the underlying object +using ConstModel = detail::ModelImpl>; + +/** \brief Wrapper around ::OrtModel + * + */ +struct Model : detail::ModelImpl { + using DomainOpsetPair = std::pair; + + explicit Model(std::nullptr_t) {} ///< No instance is created + explicit Model(OrtModel* p) : ModelImpl{p} {} ///< Take ownership of a pointer created by C API + +#if !defined(ORT_MINIMAL_BUILD) + explicit Model(const std::vector& opsets); +#endif + + ConstModel GetConst() const { return ConstModel{this->p_}; } +}; +} // namespace Ort #include "onnxruntime_cxx_inline.h" diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 3aeb9412f350e..48c5e52e33c53 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -10,7 +10,9 @@ #include #include #include +#include #include +#include // Convert OrtStatus to Ort::Status and return // instead of throwing @@ -995,6 +997,59 @@ inline size_t ConstSessionImpl::GetOverridableInitializerCount() const { return out; } +template +inline std::vector ConstSessionImpl::GetInputNames() const { + AllocatorWithDefaultOptions allocator; + + auto num_inputs = GetInputCount(); + std::vector input_names; + input_names.reserve(num_inputs); + + for (size_t i = 0; i < num_inputs; ++i) { + char* name = nullptr; + ThrowOnError(GetApi().SessionGetInputName(this->p_, i, allocator, &name)); + input_names.push_back(name); + allocator.Free(name); + } + + return input_names; +} + +template +inline std::vector ConstSessionImpl::GetOutputNames() const { + AllocatorWithDefaultOptions allocator; + + auto num_inputs = GetOutputCount(); + std::vector output_names; + output_names.reserve(num_inputs); + + for (size_t i = 0; i < num_inputs; ++i) { + char* name = nullptr; + ThrowOnError(GetApi().SessionGetOutputName(this->p_, i, allocator, &name)); + output_names.push_back(name); + allocator.Free(name); + } + + return 
output_names; +} + +template +inline std::vector ConstSessionImpl::GetOverridableInitializerNames() const { + AllocatorWithDefaultOptions allocator; + + auto num_initializers = GetOverridableInitializerCount(); + std::vector initializer_names; + initializer_names.reserve(num_initializers); + + for (size_t i = 0; i < num_initializers; ++i) { + char* name = nullptr; + ThrowOnError(GetApi().SessionGetOverridableInitializerName(this->p_, i, allocator, &name)); + initializer_names.push_back(name); + } + + return initializer_names; +} + template inline AllocatedStringPtr ConstSessionImpl::GetInputNameAllocated(size_t index, OrtAllocator* allocator) const { char* out; @@ -1051,6 +1106,45 @@ inline TypeInfo ConstSessionImpl::GetOverridableInitializerTypeInfo(size_t in return TypeInfo{out}; } +#if !defined(ORT_MINIMAL_BUILD) +template +inline int ConstSessionImpl::GetOpset(const std::string& domain) const { + int opset; + ThrowOnError(GetModelEditorApi().SessionGetOpsetForDomain(this->p_, domain.c_str(), &opset)); + return opset; +} +#endif // !defined(ORT_MINIMAL_BUILD) + +template +std::vector ConstSessionImpl::GetInputs() const { + const std::vector input_names = GetInputNames(); + + std::vector inputs; + inputs.reserve(input_names.size()); + + for (size_t i = 0; i < input_names.size(); ++i) { + auto type_info = GetInputTypeInfo(i); + inputs.emplace_back(ValueInfo{input_names[i], type_info.GetConst()}); + } + + return inputs; +} + +template +std::vector ConstSessionImpl::GetOutputs() const { + const std::vector output_names = GetOutputNames(); + + std::vector outputs; + outputs.reserve(output_names.size()); + + for (size_t i = 0; i < output_names.size(); ++i) { + auto type_info = GetOutputTypeInfo(i); + outputs.emplace_back(ValueInfo{output_names[i], type_info.GetConst()}); + } + + return outputs; +} + template inline std::vector SessionImpl::Run(const RunOptions& run_options, const char* const* input_names, const Value* input_values, size_t input_count, const char* const* output_names, size_t output_count) { @@ -1098,6 +1192,15 @@ inline void SessionImpl::SetEpDynamicOptions(const char* const* keys, const c ThrowOnError(GetApi().SetEpDynamicOptions(this->p_, keys, values, kv_len)); } +#if !defined(ORT_MINIMAL_BUILD) +template +inline void SessionImpl::FinalizeModelEditorSession(const Model& model, const SessionOptions& options, + OrtPrepackedWeightsContainer* prepacked_weights_container) { + ThrowOnError(GetModelEditorApi().ApplyModelToModelEditorSession(this->p_, model)); + ThrowOnError(GetModelEditorApi().FinalizeModelEditorSession(this->p_, options, prepacked_weights_container)); +} +#endif // #if !defined(ORT_MINIMAL_BUILD) + } // namespace detail inline SessionOptions::SessionOptions() { @@ -1144,6 +1247,32 @@ inline Session::Session(const Env& env, const void* model_data, size_t model_dat prepacked_weights_container, &this->p_)); } +#if !defined(ORT_MINIMAL_BUILD) +inline Session::Session(const Env& env, const Model& model, const SessionOptions& options) { + ThrowOnError(GetModelEditorApi().CreateSessionFromModel(env, model.GetConst(), options, &this->p_)); +} + +// static +inline Session Session::CreateModelEditorSession(const Env& env, const ORTCHAR_T* model_path, + const SessionOptions& options) { + OrtSession* session = nullptr; + ThrowOnError(GetModelEditorApi().CreateModelEditorSession(env, model_path, options, &session)); + return Session(session); +} + +// static +inline Session Session::CreateModelEditorSession(const Env& env, const void* model_data, size_t model_data_length, + 
const SessionOptions& options) { + OrtSession* session = nullptr; + ThrowOnError(GetModelEditorApi().CreateModelEditorSessionFromArray(env, model_data, model_data_length, options, + &session)); + return Session(session); +} + +void FinalizeModelEditorSession(const Model& model, const SessionOptions& options, + OrtPrepackedWeightsContainer* prepacked_weights_container); +#endif // #if !defined(ORT_MINIMAL_BUILD) + inline AllocatedStringPtr ModelMetadata::GetProducerNameAllocated(OrtAllocator* allocator) const { char* out; ThrowOnError(GetApi().ModelMetadataGetProducerName(p_, allocator, &out)); @@ -1211,6 +1340,59 @@ inline int64_t ModelMetadata::GetVersion() const { return out; } +inline TensorTypeAndShapeInfo::TensorTypeAndShapeInfo(ONNXTensorElementDataType element_type, + const std::vector& dims, + const std::vector* symbolic_dims) { + ThrowOnError(GetApi().CreateTensorTypeAndShapeInfo(&p_)); + ThrowOnError(GetApi().SetTensorElementType(p_, element_type)); + ThrowOnError(GetApi().SetDimensions(p_, dims.data(), dims.size())); + + if (symbolic_dims) { + std::vector symbolic_dims_cstr; + symbolic_dims_cstr.reserve(symbolic_dims->size()); + std::transform(symbolic_dims->begin(), symbolic_dims->end(), std::back_inserter(symbolic_dims_cstr), + [](const std::string& s) { return s.c_str(); }); + ThrowOnError(GetApi().SetSymbolicDimensions(p_, symbolic_dims_cstr.data(), symbolic_dims_cstr.size())); + } +} + +#if !defined(ORT_MINIMAL_BUILD) +// static +inline TypeInfo TypeInfo::CreateTensorInfo(ConstTensorTypeAndShapeInfo tensor_type_and_shape_info) { + OrtTypeInfo* output = nullptr; + ThrowOnError(GetModelEditorApi().CreateTensorTypeInfo(tensor_type_and_shape_info, &output)); + return TypeInfo{output}; +} + +// static +inline TypeInfo TypeInfo::CreateSparseTensorInfo(ConstTensorTypeAndShapeInfo sparse_tensor_type_and_shape_info) { + OrtTypeInfo* output = nullptr; + ThrowOnError(GetModelEditorApi().CreateSparseTensorTypeInfo(sparse_tensor_type_and_shape_info, &output)); + return TypeInfo{output}; +} + +// static +inline TypeInfo TypeInfo::CreateSequenceTypeInfo(ConstTypeInfo sequence_type) { + OrtTypeInfo* output; + ThrowOnError(GetModelEditorApi().CreateSequenceTypeInfo(sequence_type, &output)); + return TypeInfo{output}; +} + +// static +inline TypeInfo TypeInfo::CreateMapTypeInfo(ONNXTensorElementDataType key_type, ConstTypeInfo value_type) { + OrtTypeInfo* output; + ThrowOnError(GetModelEditorApi().CreateMapTypeInfo(key_type, value_type, &output)); + return TypeInfo{output}; +} + +// static +inline TypeInfo TypeInfo::CreateOptionalTypeInfo(ConstTypeInfo contained_type) { + OrtTypeInfo* output; + ThrowOnError(GetModelEditorApi().CreateOptionalTypeInfo(contained_type, &output)); + return TypeInfo{output}; +} +#endif // #if !defined(ORT_MINIMAL_BUILD) + namespace detail { template @@ -1244,9 +1426,16 @@ inline void TensorTypeAndShapeInfoImpl::GetSymbolicDimensions(const char** va ThrowOnError(GetApi().GetSymbolicDimensions(this->p_, values, values_count)); } +template +inline std::vector TensorTypeAndShapeInfoImpl::GetSymbolicDimensions() const { + std::vector out(GetDimensionsCount(), nullptr); + ThrowOnError(GetApi().GetSymbolicDimensions(this->p_, out.data(), out.size())); + return out; +} + template inline std::vector TensorTypeAndShapeInfoImpl::GetShape() const { - std::vector out(GetDimensionsCount(), 0); + std::vector out(GetDimensionsCount(), -1); ThrowOnError(GetApi().GetDimensions(this->p_, out.data(), out.size())); return out; } @@ -1560,23 +1749,35 @@ void 
ValueImpl::FillSparseTensorBlockSparse(const OrtMemoryInfo* data_mem_inf } // namespace detail template -inline Value Value::CreateTensor(const OrtMemoryInfo* info, T* p_data, size_t p_data_element_count, const int64_t* shape, size_t shape_len) { +inline Value Value::CreateTensor(const OrtMemoryInfo* info, T* p_data, size_t p_data_element_count, + const int64_t* shape, size_t shape_len) { return CreateTensor(info, p_data, p_data_element_count * sizeof(T), shape, shape_len, TypeToTensorType::type); } -inline Value Value::CreateTensor(const OrtMemoryInfo* info, void* p_data, size_t p_data_byte_count, const int64_t* shape, size_t shape_len, +inline Value Value::CreateTensor(const OrtMemoryInfo* info, void* p_data, size_t p_data_byte_count, + const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type) { OrtValue* out; ThrowOnError(GetApi().CreateTensorWithDataAsOrtValue(info, p_data, p_data_byte_count, shape, shape_len, type, &out)); return Value{out}; } +inline Value Value::CreateTensor(OrtAllocator* deleter, void* p_data, size_t p_data_byte_count, + const int64_t* shape, size_t shape_len, + ONNXTensorElementDataType type) { + OrtValue* out; + ThrowOnError(GetApi().CreateTensorWithDataAndDeleterAsOrtValue(deleter, p_data, p_data_byte_count, + shape, shape_len, type, &out)); + return Value{out}; +} + template inline Value Value::CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len) { return CreateTensor(allocator, shape, shape_len, TypeToTensorType::type); } -inline Value Value::CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type) { +inline Value Value::CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len, + ONNXTensorElementDataType type) { OrtValue* out; ThrowOnError(GetApi().CreateTensorAsOrtValue(allocator, shape, shape_len, type, &out)); return Value{out}; @@ -1594,7 +1795,8 @@ inline Value Value::CreateSparseTensor(const OrtMemoryInfo* info, void* p_data, const Shape& values_shape, ONNXTensorElementDataType type) { OrtValue* out; ThrowOnError(GetApi().CreateSparseTensorWithValuesAsOrtValue(info, p_data, dense_shape.shape, dense_shape.shape_len, - values_shape.shape, values_shape.shape_len, type, &out)); + values_shape.shape, values_shape.shape_len, type, + &out)); return Value{out}; } @@ -2167,4 +2369,142 @@ inline const OrtOpAttr* ShapeInferContext::GetAttrHdl(const char* attr_name) con return attr_hdl; } +namespace detail { +inline std::vector StringsToCharPtrs(const std::vector& strings) { + std::vector ptrs; + ptrs.reserve(strings.size()); + std::transform(strings.begin(), strings.end(), std::back_inserter(ptrs), + [](const std::string& s) { return s.c_str(); }); + + return ptrs; +} +} // namespace detail + +#if !defined(ORT_MINIMAL_BUILD) +// static +inline void Node::Init(const std::string& operator_name, const std::string& operator_domain, + const std::string& node_name, + const std::vector& input_names, + const std::vector& output_names, + std::vector& attributes, + OrtNode*& node) { + auto inputs = detail::StringsToCharPtrs(input_names); + auto outputs = detail::StringsToCharPtrs(output_names); + + std::vector attributes_ptrs; + attributes_ptrs.reserve(attributes.size()); + std::transform(attributes.begin(), attributes.end(), std::back_inserter(attributes_ptrs), + [](OpAttr& attr) -> OrtOpAttr* { return attr; }); + + ThrowOnError(GetModelEditorApi().CreateNode(operator_name.c_str(), operator_domain.c_str(), node_name.c_str(), + inputs.data(), inputs.size(), + 
outputs.data(), outputs.size(), + attributes_ptrs.data(), attributes_ptrs.size(), + &node)); + + // Node now owns the attributes + std::for_each(attributes.begin(), attributes.end(), [](OpAttr& attr) { attr.release(); }); +} + +inline Node::Node(const std::string& operator_name, const std::string& operator_domain, + const std::string& node_name, + const std::vector& input_names, + const std::vector& output_names, + std::vector& attributes) { + Init(operator_name, operator_domain, node_name, input_names, output_names, attributes, p_); +} + +inline Node::Node(const std::string& operator_name, const std::string& operator_domain, + const std::string& node_name, + const std::vector& input_names, + const std::vector& output_names) { + std::vector empty_attributes; + Init(operator_name, operator_domain, node_name, input_names, output_names, empty_attributes, p_); +} + +inline Graph::Graph() { + ThrowOnError(GetModelEditorApi().CreateGraph(&p_)); +} + +inline Model::Model(const std::vector& opsets) { + std::vector domains; + std::vector versions; + domains.reserve(opsets.size()); + versions.reserve(opsets.size()); + + for (const auto& pair : opsets) { + domains.push_back(pair.first.c_str()); + versions.push_back(pair.second); + } + + ThrowOnError(GetModelEditorApi().CreateModel(domains.data(), versions.data(), opsets.size(), &p_)); +} + +inline ValueInfo::ValueInfo(const std::string& name, const ConstTypeInfo& type_info) { + ThrowOnError(GetModelEditorApi().CreateValueInfo(name.c_str(), type_info, &p_)); +} +#endif // !defined(ORT_MINIMAL_BUILD) + +namespace detail { +template <> +inline std::string ValueInfoImpl::Name() const { + const char* name = nullptr; + ThrowOnError(GetApi().GetValueInfoName(this->p_, &name)); + return name; +} + +template <> +inline ConstTypeInfo ValueInfoImpl::TypeInfo() const { + const OrtTypeInfo* type_info = nullptr; + ThrowOnError(GetApi().GetValueInfoTypeInfo(this->p_, &type_info)); + return ConstTypeInfo{type_info}; +} + +#if !defined(ORT_MINIMAL_BUILD) +template <> +inline void GraphImpl::SetInputs(std::vector& inputs) { + std::vector inputs_ptrs; + inputs_ptrs.reserve(inputs.size()); + std::transform(inputs.begin(), inputs.end(), std::back_inserter(inputs_ptrs), + [](ValueInfo& vi) -> OrtValueInfo* { return vi; }); + + ThrowOnError(GetModelEditorApi().SetGraphInputs(p_, inputs_ptrs.data(), inputs_ptrs.size())); + + // Graph now owns the inputs + std::for_each(inputs.begin(), inputs.end(), [](ValueInfo& vi) { vi.release(); }); +} + +template <> +inline void GraphImpl::SetOutputs(std::vector& outputs) { + std::vector outputs_ptrs; + outputs_ptrs.reserve(outputs.size()); + std::transform(outputs.begin(), outputs.end(), std::back_inserter(outputs_ptrs), + [](ValueInfo& vi) -> OrtValueInfo* { return vi; }); + + ThrowOnError(GetModelEditorApi().SetGraphOutputs(p_, outputs_ptrs.data(), outputs_ptrs.size())); + + // Graph now owns the outputs + std::for_each(outputs.begin(), outputs.end(), [](ValueInfo& vi) { vi.release(); }); +} + +template <> +inline void GraphImpl::AddInitializer(const std::string& name, Value& initializer, bool data_is_external) { + // Graph takes ownership of `initializer` + ThrowOnError(GetModelEditorApi().AddInitializerToGraph(p_, name.c_str(), initializer.release(), data_is_external)); +} + +template <> +inline void GraphImpl::AddNode(Node& node) { + // Graph takes ownership of `node` + ThrowOnError(GetModelEditorApi().AddNodeToGraph(p_, node.release())); +} + +template <> +inline void ModelImpl::AddGraph(Graph& graph) { + // Model takes ownership 
of `graph` + ThrowOnError(GetModelEditorApi().AddGraphToModel(p_, graph.release())); +} +#endif // !defined(ORT_MINIMAL_BUILD) + +} // namespace detail } // namespace Ort diff --git a/onnxruntime/core/framework/onnxruntime_typeinfo.cc b/onnxruntime/core/framework/onnxruntime_typeinfo.cc index a884927abddb7..1c446840b7938 100644 --- a/onnxruntime/core/framework/onnxruntime_typeinfo.cc +++ b/onnxruntime/core/framework/onnxruntime_typeinfo.cc @@ -10,8 +10,8 @@ #include "core/framework/sparse_tensor.h" #include "core/graph/onnx_protobuf.h" #include "core/session/ort_apis.h" +#include "core/session/model_editor_api.h" #include "core/framework/error_code_helper.h" - #include "core/framework/tensor_type_and_shape.h" #include "core/framework/onnxruntime_map_type_info.h" #include "core/framework/onnxruntime_sequence_type_info.h" @@ -40,7 +40,7 @@ OrtTypeInfo::OrtTypeInfo(std::unique_ptr optional_type_info : type(ONNX_TYPE_OPTIONAL), optional_type_info(std::move(optional_type_info)) {} OrtTypeInfo::OrtTypeInfo(ONNXType type, std::unique_ptr data) noexcept - : type(type), data(std::move(data)) { + : type(type), tensor_type_info(std::move(data)) { } OrtTypeInfo::~OrtTypeInfo() = default; @@ -55,7 +55,9 @@ ORT_API_STATUS_IMPL(OrtApis::GetOnnxTypeFromTypeInfo, _In_ const struct OrtTypeI ORT_API_STATUS_IMPL(OrtApis::CastTypeInfoToTensorInfo, _In_ const struct OrtTypeInfo* input, _Outptr_result_maybenull_ const struct OrtTensorTypeAndShapeInfo** out) { API_IMPL_BEGIN - *out = (input->type == ONNX_TYPE_TENSOR || input->type == ONNX_TYPE_SPARSETENSOR) ? input->data.get() : nullptr; + *out = (input->type == ONNX_TYPE_TENSOR || input->type == ONNX_TYPE_SPARSETENSOR) + ? input->tensor_type_info.get() + : nullptr; return nullptr; API_IMPL_END } @@ -84,8 +86,8 @@ ORT_API_STATUS_IMPL(OrtApis::CastTypeInfoToOptionalTypeInfo, _In_ const OrtTypeI API_IMPL_END } -ORT_API_STATUS_IMPL(OrtApis::GetDenotationFromTypeInfo, _In_ const OrtTypeInfo* type_info, _Out_ const char** const out, - _Out_ size_t* len) { +ORT_API_STATUS_IMPL(OrtApis::GetDenotationFromTypeInfo, _In_ const OrtTypeInfo* type_info, + _Out_ const char** const out, _Out_ size_t* len) { API_IMPL_BEGIN *out = type_info->denotation.c_str(); *len = type_info->denotation.size(); @@ -93,6 +95,61 @@ ORT_API_STATUS_IMPL(OrtApis::GetDenotationFromTypeInfo, _In_ const OrtTypeInfo* API_IMPL_END } +#if !defined(ORT_MINIMAL_BUILD) +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateTensorTypeInfo, _In_ const OrtTensorTypeAndShapeInfo* tensor_info, + _Out_ OrtTypeInfo** type_info) { + API_IMPL_BEGIN + auto ti = std::make_unique(ONNXType::ONNX_TYPE_TENSOR); + ti->tensor_type_info = tensor_info->Clone(); + *type_info = ti.release(); + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateSparseTensorTypeInfo, _In_ const OrtTensorTypeAndShapeInfo* tensor_info, + _Out_ OrtTypeInfo** type_info) { + API_IMPL_BEGIN + auto ti = std::make_unique(ONNXType::ONNX_TYPE_SPARSETENSOR); + ti->tensor_type_info = tensor_info->Clone(); + *type_info = ti.release(); + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateMapTypeInfo, ONNXTensorElementDataType map_key_type, + _In_ const OrtTypeInfo* map_value_type, _Out_ OrtTypeInfo** type_info) { + API_IMPL_BEGIN + auto ti = std::make_unique(ONNXType::ONNX_TYPE_MAP); + ti->map_type_info = std::make_unique(map_key_type, map_value_type->Clone()); + *type_info = ti.release(); + + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateSequenceTypeInfo, _In_ const 
OrtTypeInfo* sequence_type, + _Out_ OrtTypeInfo** type_info) { + API_IMPL_BEGIN + auto ti = std::make_unique(ONNXType::ONNX_TYPE_SEQUENCE); + ti->sequence_type_info = std::make_unique(sequence_type->Clone()); + *type_info = ti.release(); + + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateOptionalTypeInfo, _In_ const OrtTypeInfo* contained_type, + _Out_ OrtTypeInfo** type_info) { + API_IMPL_BEGIN + auto ti = std::make_unique(ONNXType::ONNX_TYPE_OPTIONAL); + ti->optional_type_info = std::make_unique(contained_type->Clone()); + *type_info = ti.release(); + + return nullptr; + API_IMPL_END +} +#endif // !defined(ORT_MINIMAL_BUILD) + ORT_API(void, OrtApis::ReleaseTypeInfo, _Frees_ptr_opt_ OrtTypeInfo* ptr) { std::unique_ptr p(ptr); } @@ -298,8 +355,8 @@ std::unique_ptr OrtTypeInfo::Clone() const { #endif case ONNX_TYPE_TENSOR: { std::unique_ptr info; - if (data) { - info = data->Clone(); + if (tensor_type_info) { + info = tensor_type_info->Clone(); } result = MakePtr(type, std::move(info)); result->denotation = denotation; diff --git a/onnxruntime/core/framework/onnxruntime_typeinfo.h b/onnxruntime/core/framework/onnxruntime_typeinfo.h index 72d263d5fa442..54bb946e0d36b 100644 --- a/onnxruntime/core/framework/onnxruntime_typeinfo.h +++ b/onnxruntime/core/framework/onnxruntime_typeinfo.h @@ -31,7 +31,7 @@ struct OrtTypeInfo { ONNXType type; std::string denotation; - std::unique_ptr data; + std::unique_ptr tensor_type_info; std::unique_ptr map_type_info; std::unique_ptr sequence_type_info; std::unique_ptr optional_type_info; diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index 83a353615bc35..343d634b44691 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -203,13 +203,12 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st } } -common::Status AllocateTensor( - const onnxruntime::MemBuffer* m, - std::unique_ptr& p_tensor, - const onnxruntime::DataTypeImpl* const& type, - onnxruntime::TensorShape& tensor_shape, - bool use_device_allocator_for_initializers, - const onnxruntime::AllocatorPtr& alloc) { +common::Status AllocateTensor(const onnxruntime::MemBuffer* m, + std::unique_ptr& p_tensor, + const onnxruntime::DataTypeImpl* const& type, + onnxruntime::TensorShape& tensor_shape, + bool use_device_allocator_for_initializers, + const onnxruntime::AllocatorPtr& alloc) { if (m != nullptr) { p_tensor = std::make_unique(type, tensor_shape, m->GetBuffer(), m->GetAllocInfo()); if (m->GetLen() < p_tensor->SizeInBytes()) { @@ -354,6 +353,7 @@ common::Status SaveInitializedTensors( } ORT_RETURN_IF_ERROR(planner.Trace(entry.first, entry.second)); } + // 2. allocate weight buffer on different locations // planned_initializers_memory_size_in_byte is not actual physical size. // It's the virtual size computed by planner. 
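// Aside: a minimal sketch (names hypothetical, not the ORT API) of the ownership
// pattern the hunk below adopts for `buffered_tensors`: borrow the Tensor with
// get(), and only release() the unique_ptr once deserialization has succeeded,
// so a failure path cannot leak the buffered Tensor.

#include <map>
#include <memory>
#include <string>

struct Tensor { /* stand-in for onnxruntime::Tensor */ };

bool Deserialize(Tensor* preallocated) { return preallocated != nullptr; }  // stand-in; may fail

void UseBufferedTensor(std::map<std::string, std::unique_ptr<Tensor>>& buffered_tensors,
                       const std::string& name) {
  Tensor* p_tensor = nullptr;
  auto it = buffered_tensors.find(name);
  if (it != buffered_tensors.end()) {
    p_tensor = it->second.get();  // borrow; the map still owns the Tensor
  }

  if (!Deserialize(p_tensor)) {
    return;  // on failure the unique_ptr still owns p_tensor and will free it
  }

  if (p_tensor != nullptr) {
    it->second.release();  // success: ownership has been taken over elsewhere
    buffered_tensors.erase(it);
  }
}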
@@ -386,6 +386,9 @@ common::Status SaveInitializedTensors( if (user_supplied_initializer_ids.find(entry.first) != user_supplied_initializer_ids.end()) { ort_value = *(session_options.initializers_to_share_map.at(name)); LOGS(logger, INFO) << "Using user supplied initializer with name (" << name << ")."; + + } else if (graph.GetOrtValueInitializer(name, ort_value)) { + // populated OrtValue from the Graph instance } else { const ONNX_NAMESPACE::TensorProto& tensor_proto = *(entry.second); @@ -397,10 +400,9 @@ common::Status SaveInitializedTensors( session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsUseDeviceAllocatorForInitializers, "0") == "1"; Tensor* p_tensor = nullptr; - if (auto iter = buffered_tensors.find(name); - iter != buffered_tensors.end()) { - p_tensor = iter->second.release(); - buffered_tensors.erase(iter); + auto buffered_tensors_iter = buffered_tensors.find(name); + if (buffered_tensors_iter != buffered_tensors.end()) { + p_tensor = buffered_tensors_iter->second.get(); } Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, (m.has_value()) ? &*m : nullptr, alloc, @@ -412,6 +414,12 @@ common::Status SaveInitializedTensors( oss << "Deserialize tensor " << name << " failed." << st.ErrorMessage(); return Status(st.Category(), st.Code(), oss.str()); } + + if (p_tensor != nullptr) { + // p_tensor was wrapped in a deleter by DeserializeTensorProto so we can simply release it here. + ORT_IGNORE_RETURN_VALUE(buffered_tensors_iter->second.release()); + buffered_tensors.erase(buffered_tensors_iter); + } } // 'name' is a reference to a string within the TensorProto that save_tensor_func may free diff --git a/onnxruntime/core/framework/tensor_type_and_shape.cc b/onnxruntime/core/framework/tensor_type_and_shape.cc index 418e46924fb9f..9bbea279da82d 100644 --- a/onnxruntime/core/framework/tensor_type_and_shape.cc +++ b/onnxruntime/core/framework/tensor_type_and_shape.cc @@ -49,10 +49,27 @@ ORT_API_STATUS_IMPL(OrtApis::SetTensorElementType, _Inout_ OrtTensorTypeAndShape API_IMPL_END } -ORT_API_STATUS_IMPL(OrtApis::SetDimensions, OrtTensorTypeAndShapeInfo* this_ptr, +ORT_API_STATUS_IMPL(OrtApis::SetDimensions, OrtTensorTypeAndShapeInfo* info, _In_ const int64_t* dim_values, size_t dim_count) { API_IMPL_BEGIN - this_ptr->shape = onnxruntime::TensorShape(dim_values, dim_count); + if (std::any_of(dim_values, dim_values + dim_count, [](int64_t v) { return v < -1; })) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "dim_values must be -1 (symbolic dimension) or larger."); + } + + auto num_dims = std::max(dim_count, info->dim_params.size()); + + // make shape and dim_values consistent + info->dim_params.resize(num_dims, ""); + + onnxruntime::TensorShapeVector dims; + dims.resize(num_dims, -1); + + for (size_t idx = 0; idx < dim_count; ++idx) { + dims[idx] = dim_values[idx]; + } + + info->shape = onnxruntime::TensorShape(dims); + return nullptr; API_IMPL_END } @@ -88,10 +105,22 @@ ORT_API_STATUS_IMPL(OrtApis::GetSymbolicDimensions, ORT_API_STATUS_IMPL(OrtApis::SetSymbolicDimensions, _In_ struct OrtTensorTypeAndShapeInfo* info, _In_ const char** names, _In_ size_t dim_params_length) { + auto num_dims = std::max(info->shape.NumDimensions(), dim_params_length); + + // make shape and dim_values consistent + if (num_dims > info->shape.NumDimensions()) { + auto dim_values = info->shape.AsShapeVector(); + dim_values.resize(num_dims, -1); + info->shape = onnxruntime::TensorShape(dim_values); + } + info->dim_params.clear(); + info->dim_params.resize(num_dims, ""); + for 
(size_t idx = 0; idx < dim_params_length; ++idx) { - info->dim_params.push_back(names[idx]); + info->dim_params[idx] = names[idx]; } + return nullptr; } diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 17c37b8882168..ae1ec2e53bd7c 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -1317,22 +1317,15 @@ ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const auto* raw_data = tensor.DataRaw(); ORT_ENFORCE(raw_data, "Missing raw data for tensor proto. Invalid tensor."); static_assert(sizeof(void*) <= sizeof(ExternalDataInfo::OFFSET_TYPE)); - tensor_proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); // we reinterpret_cast this back to void* in tensorprotoutils.cc:GetExtDataFromTensorProto. // use intptr_t as OFFSET_TYPE is signed. in theory you could get a weird looking value if the address uses the // high bit, but that should be unlikely in a scenario where we care about memory usage enough to use this path. auto offset = narrow(reinterpret_cast(raw_data)); - ONNX_NAMESPACE::StringStringEntryProto* entry = tensor_proto.mutable_external_data()->Add(); - entry->set_key("location"); - entry->set_value(ToUTF8String(onnxruntime::utils::kTensorProtoMemoryAddressTag)); - entry = tensor_proto.mutable_external_data()->Add(); - entry->set_key("offset"); - entry->set_value(std::to_string(offset)); - entry = tensor_proto.mutable_external_data()->Add(); - entry->set_key("length"); - entry->set_value(std::to_string(tensor.SizeInBytes())); + ExternalDataInfo::SetExternalLocationToProto(onnxruntime::utils::kTensorProtoMemoryAddressTag, + offset, tensor.SizeInBytes(), tensor_proto); + } else { utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), tensor.SizeInBytes()); } diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index e4915616b7b7c..39ffc6a5b0cee 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -7,30 +7,34 @@ #include #include #include -#include #include +#include -#include "core/common/common.h" #include + +#include "core/common/common.h" #include "core/common/inlined_containers.h" #include "core/common/logging/logging.h" #include "core/common/narrow.h" #include "core/flatbuffers/flatbuffers_utils.h" +#include "core/framework/tensor_type_and_shape.h" #include "core/flatbuffers/schema/ort.fbs.h" -#include "core/framework/tensor_shape.h" #include "core/framework/tensor_external_data_info.h" +#include "core/framework/tensor_shape.h" +#include "core/framework/tensor_type_and_shape.h" #include "core/framework/tensorprotoutils.h" #include "core/framework/utils.h" +#include "core/graph/function_utils.h" #include "core/graph/graph_flatbuffers_utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/indexed_sub_graph.h" #include "core/graph/model.h" +#include "core/graph/model_editor_api_types.h" #include "core/graph/model_load_utils.h" #include "core/graph/model_saving_options.h" #include "core/graph/node_attr_utils.h" #include "core/graph/op.h" #include "core/graph/runtime_optimization_record_container.h" -#include "core/graph/function_utils.h" #if !defined(ORT_MINIMAL_BUILD) #include "core/graph/function.h" @@ -3500,6 +3504,10 @@ void Graph::RemoveInitializedTensor(const std::string& tensor_name) { #if !defined(DISABLE_SPARSE_TENSORS) sparse_tensor_names_.erase(tensor_name); #endif + + // doesn't matter if it existed or not + 
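// Aside: a minimal sketch of the SetDimensions/SetSymbolicDimensions interplay
// patched above in tensor_type_and_shape.cc. Each setter now pads the other view
// to the same rank (-1 / "" for unknown), so the calls work in either order.
// Assumes a valid OrtApi pointer; status returns are ignored for brevity.

#include <cassert>
#include "onnxruntime_c_api.h"

void SymbolicShapeExample(const OrtApi* ort) {
  OrtTensorTypeAndShapeInfo* info = nullptr;
  ort->CreateTensorTypeAndShapeInfo(&info);
  ort->SetTensorElementType(info, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);

  const int64_t dims[] = {-1, 3, 224};  // dim 0 is symbolic
  ort->SetDimensions(info, dims, 3);

  const char* names[] = {"batch"};  // name only dim 0; dims 1..2 keep ""
  ort->SetSymbolicDimensions(info, names, 1);

  size_t rank = 0;
  ort->GetDimensionsCount(info, &rank);
  assert(rank == 3);  // shape and dim_params were kept the same length

  ort->ReleaseTensorTypeAndShapeInfo(info);
}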
ORT_IGNORE_RETURN_VALUE(ortvalue_initializers_.erase(tensor_name)); + SetGraphResolveNeeded(); } else { #if !defined(DISABLE_SPARSE_TENSORS) @@ -3631,8 +3639,8 @@ Status Graph::InjectExternalInitializersFromFilesInMemory( return Status::OK(); } -#endif // DISABLE_EXTERNAL_INITIALIZERS +#endif // DISABLE_EXTERNAL_INITIALIZERS #endif // !defined(ORT_MINIMAL_BUILD) bool Graph::GetInitializedTensor(const std::string& tensor_name, const TensorProto*& value) const { @@ -3645,6 +3653,16 @@ bool Graph::GetInitializedTensor(const std::string& tensor_name, const TensorPro return true; } +bool Graph::GetOrtValueInitializer(const std::string& name, OrtValue& value) const { + auto it = ortvalue_initializers_.find(name); + if (it == ortvalue_initializers_.end()) { + return false; + } + + value = it->second; + return true; +} + void Graph::CleanAllInitializedTensors() noexcept { name_to_initial_tensor_.clear(); #if !defined(DISABLE_SPARSE_TENSORS) @@ -3660,6 +3678,8 @@ void Graph::CleanAllInitializedTensors() noexcept { delete graph_proto_->mutable_initializer()->ReleaseCleared(); } #endif + + ortvalue_initializers_.clear(); } const ONNX_NAMESPACE::TensorProto* Graph::GetConstantInitializer(const std::string& initializer_name, @@ -3709,13 +3729,14 @@ void Graph::AddValueInfo(const NodeArg* new_value_info) { value_info_.insert(new_value_info); } -std::vector Graph::CreateNodeArgs(const google::protobuf::RepeatedPtrField& names, +template +std::vector Graph::CreateNodeArgs(const StringRange& names, const ArgNameToTypeMap& name_to_type_map) { const auto name_to_type_map_end = name_to_type_map.end(); std::vector results; results.reserve(names.size()); - for (auto& name : names) { + for (const std::string& name : names) { const TypeProto* type = nullptr; auto name_to_type_iter = name_to_type_map.find(name); @@ -4076,27 +4097,51 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const { // This is used for constructing full path for external data // if it exists + auto add_initializer = [](TensorList& output_initializers, const TensorProto& initializer) -> void { + TensorProto& output = *output_initializers.Add(); + output = initializer; + + // copy any in-memory external data into raw data + if (utils::HasExternalData(initializer)) { + const std::filesystem::path ignored; + std::basic_string location; + onnxruntime::FileOffsetType file_offset; + SafeInt tensor_byte_size; + + ORT_THROW_IF_ERROR(utils::GetExternalDataInfo(initializer, ignored, location, file_offset, tensor_byte_size)); + + if (location == onnxruntime::utils::kTensorProtoMemoryAddressTag) { + // file_offset is address + void* data = reinterpret_cast(file_offset); + + // set in raw data + output.clear_data_location(); + output.set_raw_data(data, tensor_byte_size); + } + } + }; + + auto* mutable_initializers = result.mutable_initializer(); + #if !defined(DISABLE_SPARSE_TENSORS) const auto& model_path = ModelPath(); // We want to make sure that sparse initializers do not appear // as dense duplicates within the initializers list. 
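// Aside: a sketch of what the `add_initializer` lambda above does when it meets
// in-memory "external" data. The location/offset/length triple comes from the
// initializer's external_data entries; for the in-memory tag the offset is
// really a pointer, and the bytes are copied into raw_data so the emitted
// GraphProto is self-contained. `kInMemoryTag` is a stand-in for
// onnxruntime::utils::kTensorProtoMemoryAddressTag, not its actual value.

#include <cstdint>
#include <string>
#include "core/graph/onnx_protobuf.h"  // ONNX_NAMESPACE::TensorProto

constexpr const char* kInMemoryTag = "<in-memory address tag>";  // stand-in value

void InlineInMemoryData(ONNX_NAMESPACE::TensorProto& tensor,
                        const std::string& location, int64_t offset, size_t length) {
  if (location == kInMemoryTag) {
    const void* data = reinterpret_cast<const void*>(static_cast<intptr_t>(offset));
    tensor.clear_data_location();  // no longer marked external
    tensor.set_raw_data(data, length);
  }
}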
- if (!sparse_tensor_names_.empty()) { - const auto sparse_end = sparse_tensor_names_.end(); - auto* mutable_initializer = result.mutable_initializer(); - for (const auto& initializer : graph_proto_->initializer()) { - if (sparse_end == sparse_tensor_names_.find(initializer.name())) { - *mutable_initializer->Add() = initializer; - } else { - auto& sparse_initializer = *result.add_sparse_initializer(); - auto status = utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer); - ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse"); - } + const bool has_sparse_initializers = !sparse_tensor_names_.empty(); + const auto sparse_end = sparse_tensor_names_.end(); + for (const auto& initializer : graph_proto_->initializer()) { + if (!has_sparse_initializers || sparse_end == sparse_tensor_names_.find(initializer.name())) { + add_initializer(*mutable_initializers, initializer); + } else { + auto& sparse_initializer = *result.add_sparse_initializer(); + auto status = utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer); + ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse"); } - } else { - *result.mutable_initializer() = graph_proto_->initializer(); } #else - *result.mutable_initializer() = graph_proto_->initializer(); + for (const auto& initializer : graph_proto_->initializer()) { + add_initializer(*mutable_initializers, initializer); + } #endif return result; @@ -5345,6 +5390,9 @@ Status Graph::InlineFunction(Node& callnode) { } void Graph::SetInputs(gsl::span inputs) { + graph_inputs_including_initializers_.clear(); + graph_inputs_excluding_initializers_.clear(); + // creating graph from scratch // rely on SetGraphInputsOutputs() to fix up graph_inputs_excluding_initializers_ // if is_loaded_from_model_file_ == false @@ -5353,7 +5401,6 @@ void Graph::SetInputs(gsl::span inputs) { if (is_loaded_from_model_file_) { // graph loaded from model file - graph_inputs_excluding_initializers_.clear(); for (const auto* input : inputs) { ORT_ENFORCE(input->Exists(), "Input to set must exist."); if (name_to_initial_tensor_.find(input->Name()) == name_to_initial_tensor_.end()) { @@ -5370,6 +5417,7 @@ void Graph::SetInputs(gsl::span inputs) { } void Graph::SetOutputs(gsl::span outputs) { + graph_outputs_.clear(); graph_outputs_.reserve(outputs.size()); graph_outputs_.assign(outputs.begin(), outputs.end()); @@ -5688,4 +5736,207 @@ common::Status Graph::LoadFromOrtFormat(const onnxruntime::fbs::Graph& fbs_graph return Status::OK(); } +#if !defined(ORT_MINIMAL_BUILD) +namespace { +ValueInfoProto OrtValueInfoToOnnx(const OrtValueInfo& vi) { + // the model builder API checks that the OrtValueInfo has a complete and valid OrtTypeInfo instance and that the + // name is not null/empty. + ORT_ENFORCE(vi.type_info->type == ONNX_TYPE_TENSOR, + "Internal error. 
Model Editor API should only allow OrtValueInfo for tensor to be created.");
+
+  ValueInfoProto value_info_proto;
+  value_info_proto.set_name(vi.name);
+
+  auto* tensor = value_info_proto.mutable_type()->mutable_tensor_type();
+  const OrtTensorTypeAndShapeInfo& tensor_info = *vi.type_info->tensor_type_info.get();
+  tensor->set_elem_type(tensor_info.type);
+
+  auto& shape = *tensor->mutable_shape();
+
+  size_t idx = 0;
+  for (auto dim : tensor_info.shape.GetDims()) {
+    auto& dim_proto = *shape.add_dim();
+    if (dim >= 0) {
+      dim_proto.set_dim_value(dim);
+    } else {
+      const std::string& dim_param = tensor_info.dim_params[idx];
+      // if empty, leave the new dim_proto with neither dim_value nor dim_param set. this represents an 'unknown' dim
+      if (!dim_param.empty()) {
+        dim_proto.set_dim_param(dim_param);
+      }
+    }
+    ++idx;
+  }
+
+  return value_info_proto;
+}
+}  // namespace
+
+Status Graph::LoadFromModelEditorApiModel(const OrtGraph& api_graph, bool updating_existing_graph) {
+  ArgNameToTypeMap name_to_type_map;
+
+  // NOTE: need to create NodeArgs as we go along
+
+  // add inputs first. the shape from an input for a non-const initializer is preferred, so we want to create the
+  // NodeArg for the value using that
+
+  auto add_graph_inputs_outputs = [&, this](
+                                      const InlinedVector<std::unique_ptr<OrtValueInfo>>& graph_inputs_or_outputs,
+                                      bool is_input) {
+    // when updating a model we don't require the inputs or outputs to be set if they're unchanged.
+    if (updating_existing_graph && graph_inputs_or_outputs.empty()) {
+      return;
+    }
+
+    std::vector<const NodeArg*> node_args;
+    node_args.reserve(graph_inputs_or_outputs.size());
+    for (auto& ort_value_info : graph_inputs_or_outputs) {
+      ValueInfoProto value_info = OrtValueInfoToOnnx(*ort_value_info);
+
+      name_to_type_map[value_info.name()] = value_info.type();
+      node_args.push_back(&GetOrCreateNodeArg(value_info.name(), &value_info.type()));
+    }
+
+    if (is_input) {
+      SetInputs(node_args);
+    } else {
+      SetOutputs(node_args);
+    }
+  };
+
+  auto add_initializers = [this](const std::unordered_map<std::string, std::unique_ptr<OrtValue>>& initializers,
+                                 bool is_external) {
+    for (auto& name_and_ortvalue : initializers) {
+      // convert from OrtValue to TensorProto
+      const std::string& name = name_and_ortvalue.first;
+      OrtValue& v = *name_and_ortvalue.second;
+
+      ORT_ENFORCE(v.IsTensor(), "Initializers must be Tensors");
+      const Tensor& t = v.Get<Tensor>();
+      TensorProto& tensor_proto = *graph_proto_->add_initializer();
+
+      tensor_proto.set_name(name);
+      tensor_proto.set_data_type(t.GetElementType());
+      for (auto dim : t.Shape().GetDims()) {
+        tensor_proto.add_dims(dim);
+      }
+
+      if (is_external) {
+        // pre-existing memory that we don't own. avoid a copy by storing the pointer in the ExternalDataInfo
+        const void* data_offset = t.DataRaw();  // address of memory, not an offset into a file
+        auto offset = narrow<ExternalDataInfo::OFFSET_TYPE>(reinterpret_cast<intptr_t>(data_offset));
+
+        ExternalDataInfo::SetExternalLocationToProto(onnxruntime::utils::kTensorProtoMemoryAddressTag,
+                                                     offset, t.SizeInBytes(), tensor_proto);
+
+        // add OrtValue to ortvalue_initializers_ to keep it alive and to store the deleter if provided.
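// Aside: the caller-side view of the two paths above, as a minimal sketch using
// the C++ wrappers added by this PR. With data_is_external=true the Graph stores
// the OrtValue itself rather than copying the bytes, so `buffer` must stay valid
// for the lifetime of the model/session. Names here are illustrative.

#include <cstddef>
#include <cstdint>
#include "onnxruntime_cxx_api.h"

void AddWeight(Ort::Graph& graph, float* buffer, size_t element_count) {
  auto mem_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  const int64_t shape[] = {static_cast<int64_t>(element_count)};

  // wraps `buffer` without copying it
  Ort::Value weight = Ort::Value::CreateTensor<float>(mem_info, buffer, element_count, shape, 1);

  // external data must be >= 128 bytes (enforced in model_editor_c_api.cc later in this patch)
  graph.AddInitializer("weight", weight, /*data_is_external*/ true);
}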
+ ortvalue_initializers_.emplace(name, std::move(v)); + } else { + tensor_proto.set_raw_data(t.DataRaw(), t.SizeInBytes()); + } + + TypeProto type_proto{TypeProtoFromTensorProto(tensor_proto)}; + ORT_IGNORE_RETURN_VALUE(GetOrCreateNodeArg(name, &type_proto)); + + name_to_initial_tensor_.emplace(name, &tensor_proto); + } + }; + + // process graph inputs first as we want the type/shape from them to be preferred if a graph input + // has a matching initializer + add_graph_inputs_outputs(api_graph.inputs, /*input*/ true); + + // add initializers + ortvalue_initializers_.reserve(api_graph.external_initializers.size()); + add_initializers(api_graph.external_initializers, /*is_external*/ true); + add_initializers(api_graph.initializers, /*is_external*/ false); + + // add graph outputs + add_graph_inputs_outputs(api_graph.outputs, /*input*/ false); + + // add nodes + for (const auto& ort_node : api_graph.nodes) { + const OrtNode& node = *ort_node; + + // convert Constant nodes to initializers + if (node.operator_name == "Constant" && node.domain_name == kOnnxDomain) { + // graph_proto_ provides storage + TensorProto& tensor = *graph_proto_->add_initializer(); + + // create NodeProto from OrtNode so we can use the existing conversion functions + NodeProto node_proto; + + // 'Constant' node has no inputs or attributes + ORT_RETURN_IF_NOT(node.input_names.empty() && node.attributes.size() == 1 && node.output_names.size() == 1, + node.node_name, + " is an invalid 'Constant' node. " + "Must have no inputs, one attribute and one output. "); + + node_proto.add_attribute()->CopyFrom(node.attributes[0]); + node_proto.add_output(node.output_names[0]); + + node_proto.set_op_type(node.operator_name); + node_proto.set_name(node.node_name); + node_proto.set_domain(node.domain_name); + + ORT_RETURN_IF_ERROR(utils::ConstantNodeProtoToTensorProto(node_proto, /*model_path*/ "", tensor)); + name_to_initial_tensor_.emplace(node.output_names[0], &tensor); + + continue; + } + + auto input_defs = CreateNodeArgs(node.input_names, name_to_type_map); + auto output_defs = CreateNodeArgs(node.output_names, name_to_type_map); + + const auto num_attributes = node.attributes.size(); + + NodeAttributes attributes; + attributes.reserve(num_attributes); + + for (const auto& attr : node.attributes) { + attributes[attr.name()] = attr; + } + + ORT_IGNORE_RETURN_VALUE(AddNode(node.node_name, node.operator_name, /*doc_string*/ "", + input_defs, output_defs, &attributes, node.domain_name)); + } + + return Resolve(); +} + +// static +Status Graph::LoadFromModelEditorApiModel(const OrtGraph& api_graph, + const Model& owning_model, + const std::unordered_map& domain_to_version, + IOnnxRuntimeOpSchemaCollectionPtr schema_registry, + bool strict_shape_type_inference, + const logging::Logger& logger, + std::unique_ptr& graph) { + graph = std::make_unique(owning_model, + domain_to_version, + schema_registry, + /*parent_graph*/ nullptr, /*parent_node*/ nullptr, + logger, + strict_shape_type_inference); + + return graph->LoadFromModelEditorApiModel(api_graph); +} + +Status Graph::UpdateUsingModelEditorApiModel(const OrtModel& api_model) { + for (auto& entry : api_model.domain_to_version) { + if (auto it = domain_to_version_.find(entry.first); it != domain_to_version_.end()) { + if (it->second != entry.second) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Domain version can not be changed for '", entry.first, + "'. 
Current version: ", it->second); + } + } else { + domain_to_version_.insert(entry); + } + } + + // this will replace inputs/outputs and add nodes. + return LoadFromModelEditorApiModel(*api_model.graph, /*updating_existing_graph*/ true); +} + +#endif // !defined(ORT_MINIMAL_BUILD) } // namespace onnxruntime diff --git a/onnxruntime/core/graph/graph_flatbuffers_utils.cc b/onnxruntime/core/graph/graph_flatbuffers_utils.cc index 922759b02e75f..199aa79cc1dde 100644 --- a/onnxruntime/core/graph/graph_flatbuffers_utils.cc +++ b/onnxruntime/core/graph/graph_flatbuffers_utils.cc @@ -300,8 +300,6 @@ Status LoadInitializerOrtFormat(const fbs::Tensor& fbs_tensor, TensorProto& init const auto* fbs_raw_data = fbs_tensor.raw_data(); if (fbs_raw_data) { if (load_options.can_use_flatbuffer_for_initializers && fbs_raw_data->size() > 127) { - initializer.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); - static_assert(sizeof(void*) <= sizeof(ExternalDataInfo::OFFSET_TYPE)); const void* data_offset = fbs_raw_data->Data(); // we reinterpret_cast this back to void* in tensorprotoutils.cc:GetExtDataFromTensorProto. @@ -309,15 +307,9 @@ Status LoadInitializerOrtFormat(const fbs::Tensor& fbs_tensor, TensorProto& init // high bit, but that should be unlikely in a scenario where we care about memory usage enough to use this path. auto offset = narrow(reinterpret_cast(data_offset)); - ONNX_NAMESPACE::StringStringEntryProto* entry = initializer.mutable_external_data()->Add(); - entry->set_key("location"); - entry->set_value(ToUTF8String(onnxruntime::utils::kTensorProtoMemoryAddressTag)); - entry = initializer.mutable_external_data()->Add(); - entry->set_key("offset"); - entry->set_value(std::to_string(offset)); - entry = initializer.mutable_external_data()->Add(); - entry->set_key("length"); - entry->set_value(std::to_string(fbs_raw_data->size())); + ExternalDataInfo::SetExternalLocationToProto(onnxruntime::utils::kTensorProtoMemoryAddressTag, + offset, fbs_raw_data->size(), initializer); + } else { // fbs_raw_data is uint8_t vector, so the size is byte size initializer.set_raw_data(fbs_raw_data->Data(), fbs_raw_data->size()); diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index be0531e6473fb..7629e40c1b5fe 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -7,6 +7,7 @@ #include "core/flatbuffers/flatbuffers_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/model.h" +#include "core/graph/model_editor_api_types.h" #include "core/graph/model_load_utils.h" #ifdef _MSC_VER @@ -738,6 +739,36 @@ Status Model::Load(int fd, const PathString& model_path, std::shared_ptr& return Status::OK(); } +// static +common::Status Model::LoadFromModelEditorApiModel(const OrtModel& model_editor_api_model, + const IOnnxRuntimeOpSchemaRegistryList* local_registries, + const ModelOptions& options, + const logging::Logger& logger, + std::unique_ptr& model) { + model = std::make_unique(); + model->model_proto_.set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + // The optimizer Initializer class requires a path if external data is used, however in the Graph API usage the + // external data is pointing to pre-allocated memory and does not require a path. Set a dummy value to make it happy. 
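// Aside: the domain/opset merge rule implemented just above in
// Graph::UpdateUsingModelEditorApiModel, reduced to a standalone sketch:
// an existing domain must keep its version; unknown domains are added.

#include <string>
#include <unordered_map>

// Returns false on a version conflict (ORT returns ORT_INVALID_ARGUMENT there).
bool MergeOpsets(std::unordered_map<std::string, int>& existing,
                 const std::unordered_map<std::string, int>& updates) {
  for (const auto& [domain, version] : updates) {
    auto it = existing.find(domain);
    if (it != existing.end()) {
      if (it->second != version) {
        return false;  // e.g. trying to change the ONNX domain from opset 21 to 18
      }
    } else {
      existing.emplace(domain, version);
    }
  }
  return true;
}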
+ model->model_path_ = std::filesystem::path("_GRAPH_API_MODEL_"); + + auto schema_registry = std::make_shared(); + if (local_registries != nullptr) { + for (const auto& schema_collection : *local_registries) { + schema_registry->RegisterRegistry(schema_collection); + } + } + + ORT_RETURN_IF_ERROR(Graph::LoadFromModelEditorApiModel(*model_editor_api_model.graph, + *model, + model_editor_api_model.domain_to_version, + schema_registry, + options.strict_shape_type_inference, + logger, + model->graph_)); + + return Status::OK(); +} + Status Model::Save(Model& model, int p_fd) { if (p_fd < 0) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, " is less than 0."); @@ -917,5 +948,4 @@ common::Status Model::LoadFromOrtFormat(const fbs::Model& fbs_model, #endif return Status::OK(); } - } // namespace onnxruntime diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 2d2086aef41fd..6fd94c60d6b99 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -280,6 +280,12 @@ class Model { const logging::Logger& logger, const ModelOptions& options = {}); + static common::Status LoadFromModelEditorApiModel(const OrtModel& graph_api_model, + const IOnnxRuntimeOpSchemaRegistryList* local_registries, + const ModelOptions& options, + const logging::Logger& logger, + std::unique_ptr& model); + common::Status SaveToOrtFormat(flatbuffers::FlatBufferBuilder& builder, flatbuffers::Offset& model) const; @@ -333,7 +339,7 @@ class Model { ModelMetaData model_metadata_; // Path to model file. May be empty. - const std::filesystem::path model_path_; + std::filesystem::path model_path_; // Main graph of the model. std::unique_ptr graph_; diff --git a/onnxruntime/core/graph/model_editor_api_types.h b/onnxruntime/core/graph/model_editor_api_types.h new file mode 100644 index 0000000000000..d72bd13093b61 --- /dev/null +++ b/onnxruntime/core/graph/model_editor_api_types.h @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/inlined_containers_fwd.h" +#include "core/framework/ort_value.h" +#include "core/framework/onnxruntime_typeinfo.h" +#include "core/graph/onnx_protobuf.h" + +// ORT C interface types for OrtGraphApi can't be in a namespace. +// We need to define them here so onnxruntime::Model can be created from OrtModel. + +struct OrtValueInfo { + std::string name; + std::unique_ptr type_info; +}; + +struct OrtOpAttr { + ONNX_NAMESPACE::AttributeProto attr_proto; +}; + +struct OrtNode { + std::string operator_name; + std::string domain_name; + std::string node_name; + + // OrtOpAttr is 1:1 with ONNX_NAMESPACE::AttributeProto currently. 
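// Aside: because an OrtOpAttr wraps exactly one AttributeProto (see the linked
// comment below), attributes created through the public C API map 1:1 onto ONNX
// node attributes. A minimal sketch; error handling elided.

#include "onnxruntime_c_api.h"

void MakeAlphaAttr(const OrtApi* ort, OrtOpAttr** out) {
  const float alpha = 0.5f;
  // ends up as AttributeProto{ name: "alpha", type: FLOAT, f: 0.5 }
  ort->CreateOpAttr("alpha", &alpha, /*len*/ 1, ORT_OP_ATTR_FLOAT, out);
}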
+ // https://github.com/microsoft/onnxruntime/blob/bd5a759d0cdbed6e7f611c990d4eb5457a9ecf60/onnxruntime/core/session/standalone_op_invoker.cc#L318 + onnxruntime::InlinedVector attributes; + onnxruntime::InlinedVector input_names; + onnxruntime::InlinedVector output_names; + + // FUTURE if we need control flow nodes + // std::unordered_map subgraphs; +}; + +struct OrtGraph { + onnxruntime::InlinedVector> inputs; + onnxruntime::InlinedVector> outputs; + std::unordered_map> initializers; + std::unordered_map> external_initializers; + std::vector> nodes; +}; + +struct OrtModel { + std::unique_ptr graph; + std::unordered_map domain_to_version; +}; diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc index 7ef23d6c9e895..2e733f67a888c 100644 --- a/onnxruntime/core/session/abi_session_options.cc +++ b/onnxruntime/core/session/abi_session_options.cc @@ -1,17 +1,18 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/graph/onnx_protobuf.h" -#include "core/common/inlined_containers.h" -#include "core/session/onnxruntime_c_api.h" -#include "core/session/ort_apis.h" -#include "core/framework/error_code_helper.h" -#include #include +#include #include + +#include "core/common/inlined_containers.h" +#include "core/framework/error_code_helper.h" +#include "core/graph/onnx_protobuf.h" +#include "core/session/abi_session_options_impl.h" #include "core/session/inference_session.h" -#include "abi_session_options_impl.h" -#include "api_utils.h" +#include "core/session/onnxruntime_c_api.h" +#include "core/session/ort_apis.h" +#include "core/session/utils.h" OrtSessionOptions::~OrtSessionOptions() = default; diff --git a/onnxruntime/core/session/api_utils.cc b/onnxruntime/core/session/api_utils.cc deleted file mode 100644 index f7cb8520b1e5d..0000000000000 --- a/onnxruntime/core/session/api_utils.cc +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "api_utils.h" - -onnxruntime::common::Status CopyStringToOutputArg(std::string_view str, const char* err_msg, char* out, size_t* size) { - const size_t str_len = str.size(); - const size_t req_size = str_len + 1; - - if (out == nullptr) { // User is querying the total output buffer size - *size = req_size; - return onnxruntime::common::Status::OK(); - } - - if (*size >= req_size) { // User provided a buffer of sufficient size - std::memcpy(out, str.data(), str_len); - out[str_len] = '\0'; - *size = req_size; - return onnxruntime::common::Status::OK(); - } - - // User has provided a buffer that is not large enough - *size = req_size; - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, err_msg); -} diff --git a/onnxruntime/core/session/api_utils.h b/onnxruntime/core/session/api_utils.h deleted file mode 100644 index 27c2bbd66f8d5..0000000000000 --- a/onnxruntime/core/session/api_utils.h +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#pragma once - -#include "core/common/common.h" -#include - -onnxruntime::common::Status CopyStringToOutputArg(std::string_view str, const char* err_msg, char* out, size_t* size); diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 8492391172133..bb9f278d83cf7 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -20,7 +20,7 @@ #include "core/framework/tensorprotoutils.h" #include "core/graph/onnx_protobuf.h" #include "core/session/allocator_adapters.h" -#include "core/session/api_utils.h" +#include "core/session/utils.h" #include "core/session/custom_ops.h" #include "core/session/inference_session.h" #include "core/session/ort_apis.h" diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index a1903898ea7f0..e941b1ebbaba8 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -38,6 +38,7 @@ #include "core/framework/utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/graph/model_editor_api_types.h" #include "core/graph/model_saving_options.h" #include "core/optimizer/graph_transformer_utils.h" #include "core/optimizer/graph_transformer.h" @@ -67,11 +68,11 @@ #include "core/optimizer/stft_decomposition.h" #endif #include "core/session/environment.h" -#include "core/session/user_logging_sink.h" #include "core/session/IOBinding.h" #include "core/session/inference_session_utils.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/onnxruntime_run_options_config_keys.h" +#include "core/session/user_logging_sink.h" #include "core/util/protobuf_parsing_utils.h" #include "core/util/thread_utils.h" @@ -1215,6 +1216,56 @@ common::Status InferenceSession::Load() { return LoadWithLoader(loader, "model_loading_from_saved_proto"); } +common::Status InferenceSession::Load(const OrtModel& model_editor_api_model) { + std::lock_guard l(session_mutex_); + + if (is_model_loaded_) { // already loaded + Status status(common::ONNXRUNTIME, common::MODEL_LOADED, "This session already contains a loaded model."); + LOGS(*session_logger_, ERROR) << status.ErrorMessage(); + return status; + } + + if (is_inited_) { + Status status(common::ONNXRUNTIME, common::MODEL_LOADED, "This session has already been initialized."); + LOGS(*session_logger_, ERROR) << status.ErrorMessage(); + return status; + } + + const bool strict_shape_type_inference = session_options_.config_options.GetConfigOrDefault( + kOrtSessionOptionsConfigStrictShapeTypeInference, "0") == "1"; + + // need to go from unique_ptr to shared_ptr when moving into model_ + std::unique_ptr tmp_model; + ORT_RETURN_IF_ERROR(Model::LoadFromModelEditorApiModel(model_editor_api_model, + HasLocalSchema() ? 
&custom_schema_registries_ : nullptr, + ModelOptions(true, strict_shape_type_inference), + *session_logger_, tmp_model)); + + model_ = std::move(tmp_model); + + is_model_loaded_ = true; + + return Status::OK(); +} + +common::Status InferenceSession::ApplyUpdates(const OrtModel& model_editor_api_model) { + std::lock_guard l(session_mutex_); + + if (!is_model_loaded_) { + Status status(common::ONNXRUNTIME, common::MODEL_LOADED, "This session does not contain a loaded model."); + LOGS(*session_logger_, ERROR) << status.ErrorMessage(); + return status; + } + + if (is_inited_) { + Status status(common::ONNXRUNTIME, common::MODEL_LOADED, "This session has already been initialized."); + LOGS(*session_logger_, ERROR) << status.ErrorMessage(); + return status; + } + + return model_->MainGraph().UpdateUsingModelEditorApiModel(model_editor_api_model); +} + common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool saving_model_in_ort_format) { // The transformer order: // 1. Ensure we inline as many functions as possible. We refer to it as Ahead Of Time (AOT) function inlining. @@ -3336,6 +3387,10 @@ common::Status InferenceSession::WaitForNotification(Notification* p_executor_do return Status::OK(); } +const Model& InferenceSession::GetModel() const { + return *model_; +} + SessionIOBinding::SessionIOBinding(InferenceSession* session) : sess_(session) { ORT_ENFORCE(session->NewIOBinding(&binding_).IsOK()); } diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index 2c0c09dfd3e51..5b484103c9ecf 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -47,6 +47,9 @@ namespace ONNX_NAMESPACE { class ModelProto; } // namespace ONNX_NAMESPACE +// OrtModelEditorApi Model. Used to dynamically construct a model via C API at runtime. +struct OrtModel; + namespace onnxruntime { // forward declarations class CustomRegistry; class Environment; @@ -320,6 +323,27 @@ class InferenceSession { * @return OK if success. */ [[nodiscard]] common::Status Load(); + + /** + * Load an OrtModel that was dynamically constructed via OrtModelEditorApi. + * + * @param graph_api_model OrtModel from OrtModelEditorApi + * @return OK if success. + */ + [[nodiscard]] common::Status Load(const OrtModel& graph_api_model); + + /** + * Apply updates from an OrtModel that was created via OrtModelEditorApi. + * This can: + * - add nodes at the start and end of the model + * - add initializers + * - update the graph inputs/outputs + * + * @param graph_api_model OrtModel from OrtModelEditorApi + * @return OK if success. + */ + [[nodiscard]] common::Status ApplyUpdates(const OrtModel& graph_api_model); + #endif // !defined(ORT_MINIMAL_BUILD) /** @@ -571,6 +595,8 @@ class InferenceSession { #endif + const Model& GetModel() const; + protected: #if !defined(ORT_MINIMAL_BUILD) @@ -627,6 +653,12 @@ class InferenceSession { /// convenience pointer to logger. should always be the same as session_state_.Logger(); const logging::Logger* session_logger_; + // The list of execution providers. + // This MUST be prior to model_ in case there are values in the model that were allocated using an allocator + // provided by the EP. If that is the case the allocator's `free` implementation may depend on other parts of the + // EP instance. + ExecutionProviders execution_providers_; + // The model served by this inference session instance. 
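// Aside: the member reordering above leans on a C++ guarantee: non-static data
// members are destroyed in reverse declaration order. Declaring the execution
// providers before model_ means the model is destroyed first, while EP-provided
// allocators are still alive. Standalone illustration (names hypothetical):

#include <iostream>

struct Providers {
  ~Providers() { std::cout << "providers destroyed\n"; }
};
struct LoadedModel {
  ~LoadedModel() { std::cout << "model destroyed\n"; }
};

struct SessionLike {
  Providers execution_providers_;  // declared first => destroyed last
  LoadedModel model_;              // destroyed first; may still rely on the EPs
};

int main() {
  SessionLike s;  // on scope exit prints "model destroyed" then "providers destroyed"
  return 0;
}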
// Currently this has to be a shared ptr because the Model::Load method // returns a shared_ptr only. Ideally factory functions should always return @@ -637,9 +669,6 @@ class InferenceSession { // The file path of where the model was loaded. e.g. /tmp/test_squeezenet/model.onnx PathString model_location_; - // The list of execution providers. - ExecutionProviders execution_providers_; - private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(InferenceSession); void SetLoggingManager(const SessionOptions& session_options, diff --git a/onnxruntime/core/session/model_editor_api.h b/onnxruntime/core/session/model_editor_api.h new file mode 100644 index 0000000000000..71004866bc867 --- /dev/null +++ b/onnxruntime/core/session/model_editor_api.h @@ -0,0 +1,65 @@ +namespace OrtModelEditorAPI { + +// implementation that returns the API struct +ORT_API(const OrtModelEditorApi*, GetModelEditorApi); + +// APIs to create/edit type info +ORT_API_STATUS_IMPL(CreateTensorTypeInfo, _In_ const OrtTensorTypeAndShapeInfo* tensor_info, + _Out_ OrtTypeInfo** type_info); +ORT_API_STATUS_IMPL(CreateSparseTensorTypeInfo, _In_ const OrtTensorTypeAndShapeInfo* tensor_info, + _Out_ OrtTypeInfo** type_info); +ORT_API_STATUS_IMPL(CreateMapTypeInfo, ONNXTensorElementDataType map_key_type, _In_ const OrtTypeInfo* map_value_type, + _Out_ OrtTypeInfo** type_info); +ORT_API_STATUS_IMPL(CreateSequenceTypeInfo, _In_ const OrtTypeInfo* sequence_type, _Out_ OrtTypeInfo** type_info); +ORT_API_STATUS_IMPL(CreateOptionalTypeInfo, _In_ const OrtTypeInfo* contained_type, _Out_ OrtTypeInfo** type_info); + +ORT_API_STATUS_IMPL(CreateValueInfo, _In_ const char* name, _In_ const OrtTypeInfo* type_info, + _Outptr_ OrtValueInfo** value_info); + +ORT_API_STATUS_IMPL(CreateNode, const char* operator_name, const char* domain_name, _In_ const char* node_name, + _In_reads_(input_names_len) const char* const* input_names, size_t input_names_len, + _In_reads_(output_names_len) const char* const* output_names, size_t output_names_len, + _In_reads_(attribs_len) _Inout_opt_ OrtOpAttr** attributes, _In_opt_ size_t attribs_len, + _Outptr_ OrtNode** node); + +ORT_API_STATUS_IMPL(CreateGraph, _Outptr_ OrtGraph** graph); +ORT_API_STATUS_IMPL(SetGraphInputs, _In_ OrtGraph* graph, + _In_reads_(inputs_len) _In_ OrtValueInfo** inputs, _In_ size_t inputs_len); +ORT_API_STATUS_IMPL(SetGraphOutputs, _In_ OrtGraph* graph, + _In_reads_(outputs_len) _In_ OrtValueInfo** outputs, _In_ size_t outputs_len); +ORT_API_STATUS_IMPL(AddInitializerToGraph, _In_ OrtGraph* graph, _In_ const char* name, _Inout_ OrtValue* tensor, + bool data_is_external); +ORT_API_STATUS_IMPL(AddNodeToGraph, _In_ OrtGraph* graph, _Inout_ OrtNode* node); + +ORT_API_STATUS_IMPL(CreateModel, + _In_reads_(opset_entries_len) const char* const* domain_names, + _In_reads_(opset_entries_len) const int* opset_versions, + size_t opset_entries_len, + _Outptr_ OrtModel** model); +ORT_API_STATUS_IMPL(AddGraphToModel, _In_ OrtModel* model, _Inout_ OrtGraph* graph); + +ORT_API_STATUS_IMPL(CreateSessionFromModel, _In_ const OrtEnv* env, _In_ const OrtModel* model, + _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out); + +// +// Model editing APIs for updating existing model by adding node/s at start or end. 
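// Aside: an end-to-end sketch of the editor API using the C++ wrappers added by
// this PR (Ort::Graph/Node/Model/ValueInfo). Builds a single-node model that
// computes output = Add(input_a, input_b). Shapes, names and the opset are
// illustrative; errors surface as exceptions via ThrowOnError.

#include <string>
#include <vector>
#include "onnxruntime_cxx_api.h"

Ort::Session BuildAddModel(Ort::Env& env) {
  Ort::Model model({{"", 21}});  // ONNX domain, opset 21 (illustrative)
  Ort::Graph graph;

  std::vector<int64_t> dims{-1, 4};
  auto tensor_info = Ort::TensorTypeAndShapeInfo(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, dims, nullptr);
  auto type_info = Ort::TypeInfo::CreateTensorInfo(tensor_info.GetConst());

  std::vector<Ort::ValueInfo> inputs;
  inputs.emplace_back(Ort::ValueInfo("input_a", type_info.GetConst()));
  inputs.emplace_back(Ort::ValueInfo("input_b", type_info.GetConst()));
  std::vector<Ort::ValueInfo> outputs;
  outputs.emplace_back(Ort::ValueInfo("output", type_info.GetConst()));

  graph.SetInputs(inputs);    // graph takes ownership of the ValueInfo instances
  graph.SetOutputs(outputs);

  Ort::Node add("Add", "", "add0", {"input_a", "input_b"}, {"output"});
  graph.AddNode(add);         // graph takes ownership of the node

  model.AddGraph(graph);      // model takes ownership of the graph
  return Ort::Session(env, model, Ort::SessionOptions{});
}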
+// +ORT_API_STATUS_IMPL(CreateModelEditorSession, _In_ const OrtEnv* env, + _In_ const ORTCHAR_T* model_path, + _In_ const OrtSessionOptions* options, + _Outptr_ OrtSession** out); + +ORT_API_STATUS_IMPL(CreateModelEditorSessionFromArray, _In_ const OrtEnv* env, + _In_ const void* model_data, size_t model_data_length, + _In_ const OrtSessionOptions* options, + _Outptr_ OrtSession** out); + +ORT_API_STATUS_IMPL(SessionGetOpsetForDomain, _In_ const OrtSession* session, _In_ const char* domain, + _Out_ int* opset); + +ORT_API_STATUS_IMPL(ApplyModelToModelEditorSession, _In_ OrtSession* session, _In_ OrtModel* model); + +ORT_API_STATUS_IMPL(FinalizeModelEditorSession, _In_ OrtSession* session, _In_ const OrtSessionOptions* options, + _Inout_ OrtPrepackedWeightsContainer* prepacked_weights_container); + +} // namespace OrtModelEditorAPI diff --git a/onnxruntime/core/session/model_editor_c_api.cc b/onnxruntime/core/session/model_editor_c_api.cc new file mode 100644 index 0000000000000..2f09b903ed941 --- /dev/null +++ b/onnxruntime/core/session/model_editor_c_api.cc @@ -0,0 +1,358 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include + +#include "core/framework/error_code_helper.h" +#include "core/framework/ort_value.h" +#include "core/framework/onnxruntime_typeinfo.h" +#include "core/framework/tensor_type_and_shape.h" +#include "core/graph/constants.h" +#include "core/graph/model.h" +#include "core/graph/model_editor_api_types.h" +#include "core/graph/onnx_protobuf.h" +#include "core/session/abi_session_options_impl.h" +#include "core/session/inference_session.h" +#include "core/session/model_editor_api.h" +#include "core/session/ort_apis.h" +#include "core/session/ort_env.h" +#include "core/session/utils.h" + +using namespace onnxruntime; + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateValueInfo, _In_ const char* name, _In_ const OrtTypeInfo* type_info, + _Outptr_ OrtValueInfo** value_info) { + API_IMPL_BEGIN + if (name == nullptr || *name == '\0') { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "name cannot be null or empty string"); + } + + if (type_info == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "type_info cannot be null"); + } + + if (type_info->type != ONNX_TYPE_TENSOR) { + return OrtApis::CreateStatus(ORT_FAIL, "Only tensor types are supported currently"); + } + + if (type_info->tensor_type_info == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "tensor_type_info cannot be null"); + } + + auto vi = std::make_unique(); + vi->name = name; + vi->type_info = type_info->Clone(); + + *value_info = vi.release(); + + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateNode, const char* operator_name, const char* domain_name, + _In_ const char* node_name, + _In_reads_(input_names_len) const char* const* input_names, size_t input_names_len, + _In_reads_(output_names_len) const char* const* output_names, size_t output_names_len, + _In_reads_(attribs_len) _Inout_opt_ OrtOpAttr** attributes, _In_opt_ size_t attribs_len, + _Outptr_ OrtNode** node) { + API_IMPL_BEGIN + auto n = std::make_unique(); + n->operator_name = operator_name; + n->domain_name = domain_name == kOnnxDomainAlias ? 
kOnnxDomain : domain_name; + n->node_name = node_name; + + n->input_names.reserve(input_names_len); + for (size_t i = 0; i < input_names_len; ++i) { + n->input_names.push_back(input_names[i]); + } + + n->output_names.reserve(output_names_len); + for (size_t i = 0; i < output_names_len; ++i) { + n->output_names.push_back(output_names[i]); + } + + if (attributes != nullptr) { + n->attributes.reserve(attribs_len); + for (size_t i = 0; i < attribs_len; ++i) { + n->attributes.push_back(*reinterpret_cast<ONNX_NAMESPACE::AttributeProto*>(attributes[i])); + // take ownership: as we stored a copy above, release the original value + OrtApis::ReleaseOpAttr(attributes[i]); + attributes[i] = nullptr; + } + } + + *node = n.release(); + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateGraph, _Outptr_ OrtGraph** graph) { + API_IMPL_BEGIN + auto g = std::make_unique<OrtGraph>(); + + // do some reserves to reduce reallocation. a size hint upfront would be optimal + g->initializers.reserve(32); + g->external_initializers.reserve(32); + g->nodes.reserve(64); + + *graph = g.release(); + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::SetGraphInputs, _In_ OrtGraph* graph, + _In_reads_(inputs_len) _In_ OrtValueInfo** inputs, _In_ size_t inputs_len) { + API_IMPL_BEGIN + graph->inputs.clear(); + for (size_t i = 0; i < inputs_len; ++i) { + if (inputs[i] == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "inputs cannot contain null entries"); + } + + graph->inputs.push_back(std::unique_ptr<OrtValueInfo>(inputs[i])); // take ownership + inputs[i] = nullptr; + } + + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::SetGraphOutputs, _In_ OrtGraph* graph, + _In_reads_(outputs_len) _In_ OrtValueInfo** outputs, _In_ size_t outputs_len) { + API_IMPL_BEGIN + graph->outputs.clear(); + for (size_t i = 0; i < outputs_len; ++i) { + if (outputs[i] == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "outputs cannot contain null entries"); + } + + graph->outputs.push_back(std::unique_ptr<OrtValueInfo>(outputs[i])); // take ownership + outputs[i] = nullptr; + } + + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::AddInitializerToGraph, _In_ OrtGraph* graph, _In_ const char* name, + _Inout_ OrtValue* tensor, bool data_is_external) { + API_IMPL_BEGIN + if (!tensor->IsTensor()) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Only Tensor is currently supported."); + } + + if (!tensor->IsAllocated()) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Tensor must be allocated."); + } + + const auto& t = tensor->Get<Tensor>(); + if (t.Location().device.Type() != OrtDevice::CPU) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Only CPU based tensors are currently supported."); + } + + if (data_is_external) { + // enforce that an external initializer is not used if the data size is < 128 bytes. + // this avoids potential shape inferencing errors when the initializer feeds an input used in + // shape inferencing, as ONNX shape inferencing does not support external data for those values. + // e.g. Reshape's `shape` input, Reduce's `axes`, Slice's `starts`, `ends`, `steps`, Clip's `min`, `max`, etc. + if (t.SizeInBytes() < 128) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "External initializer should only be used for data >= 128 bytes. 
" + "Please use CreateTensorAsOrtValue instead."); + } + + graph->external_initializers[name] = std::unique_ptr(tensor); // take ownership + } else { + graph->initializers[name] = std::unique_ptr(tensor); // take ownership + } + + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::AddNodeToGraph, _In_ OrtGraph* graph, _Inout_ OrtNode* node) { + API_IMPL_BEGIN + graph->nodes.push_back(std::unique_ptr(node)); // take ownership + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateModel, + _In_reads_(opset_entries_len) const char* const* domain_names, + _In_reads_(opset_entries_len) const int* opset_versions, + size_t opset_entries_len, + _Outptr_ OrtModel** model) { + API_IMPL_BEGIN + auto m = std::make_unique(); + for (size_t i = 0; i < opset_entries_len; ++i) { + m->domain_to_version[domain_names[i]] = opset_versions[i]; + } + + *model = m.release(); + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::AddGraphToModel, _In_ OrtModel* model, _Inout_ OrtGraph* graph) { + API_IMPL_BEGIN + + if (graph == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "graph cannot be null"); + } + + model->graph = std::unique_ptr(graph); // take ownership + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateSessionFromModel, _In_ const OrtEnv* env, _In_ const OrtModel* model, + _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out) { + API_IMPL_BEGIN + + std::unique_ptr sess; + OrtStatus* status = nullptr; + *out = nullptr; + + ORT_TRY { + sess = std::make_unique( + options == nullptr ? onnxruntime::SessionOptions() : options->value, + env->GetEnvironment()); + + ORT_API_RETURN_IF_STATUS_NOT_OK(sess->Load(*model)); + + ORT_API_RETURN_IF_ERROR(InitializeSession(options, *sess)); + + *out = reinterpret_cast(sess.release()); + } + ORT_CATCH(const std::exception& e) { + ORT_HANDLE_EXCEPTION([&]() { + status = OrtApis::CreateStatus(ORT_FAIL, e.what()); + }); + } + + return status; + + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateModelEditorSession, + _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path, _In_ const OrtSessionOptions* options, + _Outptr_ OrtSession** out) { + API_IMPL_BEGIN + std::unique_ptr session; + OrtStatus* status = nullptr; + *out = nullptr; + + ORT_TRY { + ORT_API_RETURN_IF_ERROR(CreateSessionAndLoadModel(options, env, model_path, nullptr, 0, session)); + *out = reinterpret_cast(session.release()); + } + ORT_CATCH(const std::exception& e) { + ORT_HANDLE_EXCEPTION([&]() { + status = OrtApis::CreateStatus(ORT_FAIL, e.what()); + }); + } + + return status; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::CreateModelEditorSessionFromArray, _In_ const OrtEnv* env, + _In_ const void* model_data, size_t model_data_length, + _In_ const OrtSessionOptions* options, + _Outptr_ OrtSession** out) { + API_IMPL_BEGIN + std::unique_ptr session; + OrtStatus* status = nullptr; + *out = nullptr; + + ORT_TRY { + ORT_API_RETURN_IF_ERROR(CreateSessionAndLoadModel(options, env, nullptr, model_data, model_data_length, session)); + *out = reinterpret_cast(session.release()); + } + ORT_CATCH(const std::exception& e) { + ORT_HANDLE_EXCEPTION([&]() { + status = OrtApis::CreateStatus(ORT_FAIL, e.what()); + }); + } + + return status; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::SessionGetOpsetForDomain, _In_ const OrtSession* ort_session, + _In_ const char* domain, _Out_ int* opset) { + const auto& session = 
*reinterpret_cast(ort_session); + const auto& domain_opset_map = session.GetModel().MainGraph().DomainToVersionMap(); + + auto it = domain_opset_map.find(domain); + if (it == domain_opset_map.cend()) { + return OrtApis::CreateStatus(ORT_FAIL, "Domain not used by model."); + } + + *opset = it->second; + return nullptr; +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::ApplyModelToModelEditorSession, + _In_ OrtSession* session, _In_ OrtModel* model) { + API_IMPL_BEGIN + auto sess = reinterpret_cast(session); + ORT_API_RETURN_IF_STATUS_NOT_OK(sess->ApplyUpdates(*model)); + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtModelEditorAPI::FinalizeModelEditorSession, _In_ OrtSession* session, + _In_ const OrtSessionOptions* options, + _Inout_ OrtPrepackedWeightsContainer* prepacked_weights_container) { + API_IMPL_BEGIN + auto sess = reinterpret_cast(session); + ORT_API_RETURN_IF_ERROR(InitializeSession(options, *sess, prepacked_weights_container)); + return nullptr; + API_IMPL_END +} + +static constexpr OrtModelEditorApi ort_model_editor_api = { + // NOTE: The C# bindings depend on the API order within this struct so all additions must be at the end, + // and no functions can be removed (the implementation needs to change to return an error). + + &OrtModelEditorAPI::CreateTensorTypeInfo, + &OrtModelEditorAPI::CreateSparseTensorTypeInfo, + &OrtModelEditorAPI::CreateMapTypeInfo, + &OrtModelEditorAPI::CreateSequenceTypeInfo, + &OrtModelEditorAPI::CreateOptionalTypeInfo, + + &OrtModelEditorAPI::CreateValueInfo, + + &OrtModelEditorAPI::CreateNode, + + &OrtModelEditorAPI::CreateGraph, + &OrtModelEditorAPI::SetGraphInputs, + &OrtModelEditorAPI::SetGraphOutputs, + &OrtModelEditorAPI::AddInitializerToGraph, + &OrtModelEditorAPI::AddNodeToGraph, + + &OrtModelEditorAPI::CreateModel, + &OrtModelEditorAPI::AddGraphToModel, + + &OrtModelEditorAPI::CreateSessionFromModel, + + &OrtModelEditorAPI::CreateModelEditorSession, + &OrtModelEditorAPI::CreateModelEditorSessionFromArray, + &OrtModelEditorAPI::SessionGetOpsetForDomain, + &OrtModelEditorAPI::ApplyModelToModelEditorSession, + &OrtModelEditorAPI::FinalizeModelEditorSession, +}; + +// checks that we don't violate the rule that the functions must remain in the slots they were originally assigned +static_assert(offsetof(OrtModelEditorApi, FinalizeModelEditorSession) / sizeof(void*) == 19, + "Size of version 21 API cannot change"); // initial version in ORT 1.21 + +ORT_API(const OrtModelEditorApi*, OrtModelEditorAPI::GetModelEditorApi) { + return &ort_model_editor_api; +} + +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 4eedcd591154f..0e23d7a791bec 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -1,45 +1,47 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/session/onnxruntime_c_api.h" -#include "core/session/allocator_adapters.h" -#include "core/session/inference_session_utils.h" -#include "core/session/IOBinding.h" -#include "core/framework/allocator.h" -#include "core/framework/error_code_helper.h" -#include "core/framework/execution_provider.h" -#include "core/framework/tensor_type_and_shape.h" -#include "core/framework/utils.h" #include #include #include +#include #include #include "core/common/common.h" #include "core/common/logging/logging.h" #include "core/common/narrow.h" -#include "core/common/status.h" #include "core/common/safeint.h" -#include "core/graph/constants.h" -#include "core/graph/graph.h" +#include "core/common/status.h" +#include "core/common/string_helper.h" #include "core/framework/allocator.h" -#include "core/framework/tensor.h" +#include "core/framework/allocator.h" +#include "core/framework/callback.h" +#include "core/framework/data_types.h" +#include "core/framework/error_code_helper.h" +#include "core/framework/execution_provider.h" +#include "core/framework/onnxruntime_typeinfo.h" #include "core/framework/ort_value.h" +#include "core/framework/tensor.h" +#include "core/framework/tensor_type_and_shape.h" +#include "core/framework/tensorprotoutils.h" +#include "core/framework/TensorSeq.h" +#include "core/framework/utils.h" +#include "core/graph/constants.h" +#include "core/graph/graph.h" +#include "core/graph/model_editor_api_types.h" #include "core/providers/get_execution_providers.h" +#include "core/session/abi_session_options_impl.h" +#include "core/session/allocator_adapters.h" #include "core/session/environment.h" -#include "core/framework/callback.h" -#include "core/framework/tensorprotoutils.h" -#include "core/framework/onnxruntime_typeinfo.h" #include "core/session/inference_session.h" +#include "core/session/inference_session_utils.h" +#include "core/session/IOBinding.h" +#include "core/session/lora_adapters.h" +#include "core/session/model_editor_api.h" +#include "core/session/onnxruntime_c_api.h" #include "core/session/ort_apis.h" #include "core/session/ort_env.h" -#include "core/framework/data_types.h" -#include "abi_session_options_impl.h" -#include "core/framework/TensorSeq.h" -#include -#include "core/common/string_helper.h" - -#include "core/session/lora_adapters.h" +#include "core/session/utils.h" #ifdef USE_CUDA #include "core/providers/cuda/cuda_provider_factory.h" @@ -114,6 +116,72 @@ using namespace onnxruntime; auto v = (value); \ auto tensor = v->GetMutable(); +namespace { +// Create tensor. Allocates memory. Tensor owns memory. Allocator is wrapped and stored in a shared_ptr in Tensor. +ORT_STATUS_PTR CreateTensorImpl(MLDataType ml_type, const int64_t* shape, size_t shape_len, + OrtAllocator* allocator, OrtValue& value) { + TensorShape tensor_shape(shape, shape_len); + AllocatorPtr alloc_ptr = std::make_shared(allocator); + Tensor::InitOrtValue(ml_type, tensor_shape, std::move(alloc_ptr), value); + return nullptr; +} + +// Create Tensor with existing data. Tensor does not own memory. 
+ORT_STATUS_PTR CreateTensorImpl(MLDataType ml_type, + const int64_t* shape, size_t shape_len, + const OrtMemoryInfo* info, + void* p_data, size_t p_data_len, + OrtValue& ort_value) { + TensorShape tensor_shape(shape, shape_len); + if (std::any_of(tensor_shape.GetDims().begin(), tensor_shape.GetDims().end(), [](int64_t v) { return v < 0; })) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "tried creating tensor with negative value in shape"); + } + + size_t size_to_allocate = 0; + Status status = Tensor::CalculateTensorStorageSize(ml_type, tensor_shape, 0 /*alignment*/, size_to_allocate); + if (!status.IsOK()) { + return ToOrtStatus(status); + } + if (size_to_allocate > p_data_len) { + std::ostringstream oss; + oss << "not enough space: expected " << size_to_allocate << ", got " << p_data_len; + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, oss.str().c_str()); + } + + Tensor::InitOrtValue(ml_type, tensor_shape, p_data, *info, ort_value); + return nullptr; +} + +ORT_STATUS_PTR CreateTensorImpl(MLDataType ml_type, + const int64_t* shape, size_t shape_len, + OrtAllocator* deleter, + void* p_data, size_t p_data_len, + OrtValue& ort_value) { + TensorShape tensor_shape(shape, shape_len); + if (std::any_of(tensor_shape.GetDims().begin(), tensor_shape.GetDims().end(), [](int64_t v) { return v < 0; })) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "tried creating tensor with negative value in shape"); + } + + size_t size_to_allocate = 0; + Status status = Tensor::CalculateTensorStorageSize(ml_type, tensor_shape, 0 /*alignment*/, size_to_allocate); + + if (!status.IsOK()) { + return ToOrtStatus(status); + } + + if (size_to_allocate > p_data_len) { + std::ostringstream oss; + oss << "p_data_len was smaller than expected. Expected:" << size_to_allocate << " Got:" << p_data_len; + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, oss.str().c_str()); + } + + AllocatorPtr alloc_ptr = std::make_shared(deleter); + Tensor::InitOrtValue(ml_type, tensor_shape, p_data, std::move(alloc_ptr), ort_value); + return nullptr; +} + +} // namespace + ORT_API_STATUS_IMPL(OrtApis::CreateEnvWithCustomLogger, OrtLoggingFunction logging_function, _In_opt_ void* logger_param, OrtLoggingLevel logging_level, _In_ const char* logid, _Outptr_ OrtEnv** out) { @@ -187,50 +255,6 @@ ORT_API_STATUS_IMPL(OrtApis::UpdateEnvWithCustomLogLevel, _In_ OrtEnv* ort_env, API_IMPL_END } -ORT_STATUS_PTR CreateTensorImpl(MLDataType ml_type, const int64_t* shape, size_t shape_len, - _Inout_ OrtAllocator* allocator, OrtValue& value) { - TensorShape tensor_shape(shape, shape_len); - AllocatorPtr alloc_ptr = std::make_shared(allocator); - Tensor::InitOrtValue(ml_type, tensor_shape, std::move(alloc_ptr), value); - return nullptr; -} - -ORT_STATUS_PTR CreateTensorImplForSeq(MLDataType elem_type, const int64_t* shape, size_t shape_len, Tensor& out) { - OrtAllocator* allocator; - // TODO(pranav): what allocator should be used to create the tensor here? 
- // for the sake of simplicity of the API using the default one here - ORT_API_RETURN_IF_ERROR(OrtApis::GetAllocatorWithDefaultOptions(&allocator)); - AllocatorPtr alloc_ptr = std::make_shared(allocator); - TensorShape tensor_shape(shape, shape_len); - out = Tensor(elem_type, tensor_shape, std::move(alloc_ptr)); - return nullptr; -} - -/** - * - * this function will create a copy of the allocator info - */ -ORT_STATUS_PTR CreateTensorImpl(MLDataType ml_type, const int64_t* shape, size_t shape_len, const OrtMemoryInfo* info, - void* p_data, size_t p_data_len, OrtValue& ort_value) { - TensorShape tensor_shape(shape, shape_len); - if (std::any_of(tensor_shape.GetDims().begin(), tensor_shape.GetDims().end(), [](int64_t v) { return v < 0; })) { - return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "tried creating tensor with negative value in shape"); - } - - size_t size_to_allocate = 0; - Status status = Tensor::CalculateTensorStorageSize(ml_type, tensor_shape, 0 /*alignment*/, size_to_allocate); - if (!status.IsOK()) { - return ToOrtStatus(status); - } - if (size_to_allocate > p_data_len) { - std::ostringstream oss; - oss << "not enough space: expected " << size_to_allocate << ", got " << p_data_len; - return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, oss.str().c_str()); - } - Tensor::InitOrtValue(ml_type, tensor_shape, p_data, *info, ort_value); - return nullptr; -} - ORT_API_STATUS_IMPL(OrtApis::CreateTensorWithDataAsOrtValue, _In_ const OrtMemoryInfo* info, _Inout_ void* p_data, size_t p_data_len, _In_ const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type, _Outptr_ OrtValue** out) { @@ -243,6 +267,20 @@ ORT_API_STATUS_IMPL(OrtApis::CreateTensorWithDataAsOrtValue, _In_ const OrtMemor API_IMPL_END } +ORT_API_STATUS_IMPL(OrtApis::CreateTensorWithDataAndDeleterAsOrtValue, _In_ OrtAllocator* deleter, + _In_ void* p_data, size_t p_data_len, + _In_ const int64_t* shape, size_t shape_len, + ONNXTensorElementDataType type, + _Outptr_ OrtValue** out) { + API_IMPL_BEGIN + auto ml_type = DataTypeImpl::TensorTypeFromONNXEnum(type)->GetElementType(); + auto value = std::make_unique(); + ORT_API_RETURN_IF_ERROR(CreateTensorImpl(ml_type, shape, shape_len, deleter, p_data, p_data_len, *value)); + *out = value.release(); + return nullptr; + API_IMPL_END +} + ORT_API_STATUS_IMPL(OrtApis::CreateTensorAsOrtValue, _Inout_ OrtAllocator* allocator, _In_ const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type, _Outptr_ OrtValue** out) { @@ -678,97 +716,6 @@ ORT_API_STATUS_IMPL(OrtApis::EnableOrtCustomOps, _Inout_ OrtSessionOptions* opti API_IMPL_END } -namespace { -// provider either model_path, or modal_data + model_data_length. -static ORT_STATUS_PTR CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, - _In_ const OrtEnv* env, - _In_opt_z_ const ORTCHAR_T* model_path, - _In_opt_ const void* model_data, - size_t model_data_length, - std::unique_ptr& sess) { - // quick check here to decide load path. InferenceSession will provide error message for invalid values. - // TODO: Could move to a helper - const Env& os_env = Env::Default(); // OS environment (!= ORT environment) - bool load_config_from_model = - os_env.GetEnvironmentVar(inference_session_utils::kOrtLoadConfigFromModelEnvVar) == "1"; - - if (load_config_from_model) { -#if !defined(ORT_MINIMAL_BUILD) - if (model_path != nullptr) { - sess = std::make_unique( - options == nullptr ? 
onnxruntime::SessionOptions() : options->value, - env->GetEnvironment(), - model_path); - } else { - sess = std::make_unique( - options == nullptr ? onnxruntime::SessionOptions() : options->value, - env->GetEnvironment(), - model_data, static_cast(model_data_length)); - } -#else - return OrtApis::CreateStatus(ORT_FAIL, "Loading config from ONNX models is not supported in this build."); -#endif - } else { - sess = std::make_unique( - options == nullptr ? onnxruntime::SessionOptions() : options->value, - env->GetEnvironment()); - } - -#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) - // Add custom domains - if (options && !options->custom_op_domains_.empty()) { - ORT_API_RETURN_IF_STATUS_NOT_OK(sess->AddCustomOpDomains(options->custom_op_domains_)); - } -#endif - - // Finish load - if (load_config_from_model) { -#if !defined(ORT_MINIMAL_BUILD) - ORT_API_RETURN_IF_STATUS_NOT_OK(sess->Load()); -#endif - } else { - if (model_path != nullptr) { - ORT_API_RETURN_IF_STATUS_NOT_OK(sess->Load(model_path)); - } else { - ORT_API_RETURN_IF_STATUS_NOT_OK(sess->Load(model_data, static_cast(model_data_length))); - } - } - - return nullptr; -} - -static ORT_STATUS_PTR InitializeSession(_In_ const OrtSessionOptions* options, - _In_ std::unique_ptr<::onnxruntime::InferenceSession>& sess, - _Inout_opt_ OrtPrepackedWeightsContainer* prepacked_weights_container = nullptr) { - // we need to disable mem pattern if DML is one of the providers since DML doesn't have the concept of - // byte addressable memory - std::vector> provider_list; - if (options) { - for (auto& factory : options->provider_factories) { - auto provider = factory->CreateProvider(); - provider_list.push_back(std::move(provider)); - } - } - - // register the providers - for (auto& provider : provider_list) { - if (provider) { - ORT_API_RETURN_IF_STATUS_NOT_OK(sess->RegisterExecutionProvider(std::move(provider))); - } - } - - if (prepacked_weights_container != nullptr) { - ORT_API_RETURN_IF_STATUS_NOT_OK(sess->AddPrePackedWeightsContainer( - reinterpret_cast(prepacked_weights_container))); - } - - ORT_API_RETURN_IF_STATUS_NOT_OK(sess->Initialize()); - - return nullptr; -} - -} // namespace - ORT_API_STATUS_IMPL(OrtApis::CreateSession, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path, _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out) { API_IMPL_BEGIN @@ -778,7 +725,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSession, _In_ const OrtEnv* env, _In_ const O ORT_TRY { ORT_API_RETURN_IF_ERROR(CreateSessionAndLoadModel(options, env, model_path, nullptr, 0, sess)); - ORT_API_RETURN_IF_ERROR(InitializeSession(options, sess)); + ORT_API_RETURN_IF_ERROR(InitializeSession(options, *sess)); *out = reinterpret_cast(sess.release()); } @@ -801,7 +748,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSessionFromArray, _In_ const OrtEnv* env, _In ORT_TRY { ORT_API_RETURN_IF_ERROR(CreateSessionAndLoadModel(options, env, nullptr, model_data, model_data_length, sess)); - ORT_API_RETURN_IF_ERROR(InitializeSession(options, sess)); + ORT_API_RETURN_IF_ERROR(InitializeSession(options, *sess)); *out = reinterpret_cast(sess.release()); } @@ -1208,7 +1155,6 @@ ORT_API_STATUS_IMPL(OrtApis::GetResizedStringTensorElementBuffer, _Inout_ OrtVal } namespace { - OrtStatusPtr GetTensorStringSpan(const ::OrtValue& v, gsl::span& span) { if (!v.IsAllocated()) { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "OrtValue should contain a Tensor or a Sparse Tensor"); @@ -2112,7 +2058,6 @@ ORT_API_STATUS_IMPL(OrtApis::GetOpaqueValue, _In_ const char* 
domain_name, _In_ } namespace { - struct ProviderBuffer { char** buffer_; char* next_write_; @@ -2342,7 +2287,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSessionWithPrepackedWeightsContainer, _In_ co ORT_TRY { ORT_API_RETURN_IF_ERROR(CreateSessionAndLoadModel(options, env, model_path, nullptr, 0, sess)); - ORT_API_RETURN_IF_ERROR(InitializeSession(options, sess, prepacked_weights_container)); + ORT_API_RETURN_IF_ERROR(InitializeSession(options, *sess, prepacked_weights_container)); *out = reinterpret_cast(sess.release()); } @@ -2368,7 +2313,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSessionFromArrayWithPrepackedWeightsContainer ORT_TRY { ORT_API_RETURN_IF_ERROR(CreateSessionAndLoadModel(options, env, nullptr, model_data, model_data_length, sess)); - ORT_API_RETURN_IF_ERROR(InitializeSession(options, sess, prepacked_weights_container)); + ORT_API_RETURN_IF_ERROR(InitializeSession(options, *sess, prepacked_weights_container)); *out = reinterpret_cast(sess.release()); } @@ -2410,6 +2355,39 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsSetCustomJoinThreadFn, _Inout_ OrtSes API_IMPL_END } +ORT_API(void, OrtApis::ReleaseValueInfo, _Frees_ptr_opt_ OrtValueInfo* value_info) { + delete value_info; +} + +ORT_API(void, OrtApis::ReleaseNode, _Frees_ptr_opt_ OrtNode* node) { + delete node; +} + +ORT_API(void, OrtApis::ReleaseGraph, _Frees_ptr_opt_ OrtGraph* graph) { + delete graph; +} + +ORT_API(void, OrtApis::ReleaseModel, _Frees_ptr_opt_ OrtModel* model) { + delete model; +} + +ORT_API_STATUS_IMPL(OrtApis::GetValueInfoName, _In_ const OrtValueInfo* value_info, + _Out_ const char** name) { + API_IMPL_BEGIN + *name = value_info->name.c_str(); + return nullptr; + API_IMPL_END +} +ORT_API_STATUS_IMPL(OrtApis::GetValueInfoTypeInfo, _In_ const OrtValueInfo* value_info, + _Outptr_ const OrtTypeInfo** type_info) { + API_IMPL_BEGIN + + *type_info = value_info->type_info.get(); + + return nullptr; + API_IMPL_END +} + ORT_API(const OrtTrainingApi*, OrtApis::GetTrainingApi, uint32_t version) { #ifdef ENABLE_TRAINING_APIS if (version >= 13 && version <= ORT_API_VERSION) @@ -2419,13 +2397,21 @@ ORT_API(const OrtTrainingApi*, OrtApis::GetTrainingApi, uint32_t version) { version, ORT_API_VERSION); return nullptr; #else - ORT_UNUSED_PARAMETER(version); return nullptr; #endif } +ORT_API(const OrtModelEditorApi*, OrtApis::GetModelEditorApi) { +#if !defined(ORT_MINIMAL_BUILD) + return OrtModelEditorAPI::GetModelEditorApi(); +#else + fprintf(stderr, "The Model Editor API is not supported in a minimal build.\n"); + return nullptr; +#endif +} + static constexpr OrtApiBase ort_api_base = { &OrtApis::GetApi, &OrtApis::GetVersionString}; @@ -2812,6 +2798,18 @@ static constexpr OrtApi ort_api_1_to_22 = { &OrtApis::SetEpDynamicOptions, // End of Version 20 - DO NOT MODIFY ABOVE (see above text for more information) + + &OrtApis::ReleaseValueInfo, + &OrtApis::ReleaseNode, + &OrtApis::ReleaseGraph, + &OrtApis::ReleaseModel, + + &OrtApis::GetValueInfoName, + &OrtApis::GetValueInfoTypeInfo, + + &OrtApis::GetModelEditorApi, + + &OrtApis::CreateTensorWithDataAndDeleterAsOrtValue, }; // OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase. 
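The new exports above are reachable through the usual versioned lookup. As a minimal sketch (not part of the patch: `WrapWeights`, `deleter` and `weights` are hypothetical caller-side names, and error handling is elided), the deleter-based tensor creation added here would be used like this:

  #include "onnxruntime_c_api.h"

  // Wrap 32 caller-owned floats (128 bytes, the minimum size AddInitializerToGraph
  // accepts for an external initializer) in an OrtValue. The data is not copied;
  // deleter->Free is invoked on it when the OrtValue is released.
  OrtStatus* WrapWeights(OrtAllocator* deleter, float* weights, OrtValue** out) {
    const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
    const OrtModelEditorApi* editor_api = api->GetModelEditorApi();  // nullptr in a minimal build
    (void)editor_api;  // would be used to add the tensor to a graph via AddInitializerToGraph
    const int64_t shape[] = {4, 8};
    return api->CreateTensorWithDataAndDeleterAsOrtValue(deleter, weights, 32 * sizeof(float),
                                                         shape, 2,
                                                         ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, out);
  }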
diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index 52d3c98d526dc..9d8aeb18a782f 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -20,6 +20,10 @@ ORT_API(void, ReleaseCustomOpDomain, _Frees_ptr_opt_ OrtCustomOpDomain*); ORT_API(void, ReleaseMapTypeInfo, _Frees_ptr_opt_ OrtMapTypeInfo*); ORT_API(void, ReleaseSequenceTypeInfo, _Frees_ptr_opt_ OrtSequenceTypeInfo*); ORT_API(void, ReleaseModelMetadata, _Frees_ptr_opt_ OrtModelMetadata*); +ORT_API(void, ReleaseValueInfo, _Frees_ptr_opt_ OrtValueInfo*); +ORT_API(void, ReleaseNode, _Frees_ptr_opt_ OrtNode*); +ORT_API(void, ReleaseGraph, _Frees_ptr_opt_ OrtGraph*); +ORT_API(void, ReleaseModel, _Frees_ptr_opt_ OrtModel*); _Check_return_ _Ret_notnull_ [[nodiscard]] OrtStatus* ORT_API_CALL CreateStatus(OrtErrorCode code, _In_z_ const char* msg) NO_EXCEPTION; @@ -533,4 +537,16 @@ ORT_API_STATUS_IMPL(RunOptionsAddActiveLoraAdapter, _Inout_ OrtRunOptions* optio ORT_API_STATUS_IMPL(SetEpDynamicOptions, _Inout_ OrtSession* sess, _In_reads_(kv_len) const char* const* keys, _In_reads_(kv_len) const char* const* values, _In_ size_t kv_len); + +ORT_API_STATUS_IMPL(GetValueInfoName, _In_ const OrtValueInfo* value_info, _Out_ const char** name); +ORT_API_STATUS_IMPL(GetValueInfoTypeInfo, _In_ const OrtValueInfo* value_info, _Outptr_ const OrtTypeInfo** type_info); + +ORT_API(const OrtModelEditorApi*, GetModelEditorApi); + +ORT_API_STATUS_IMPL(CreateTensorWithDataAndDeleterAsOrtValue, _In_ OrtAllocator* deleter, + _In_ void* p_data, size_t p_data_len, + _In_ const int64_t* shape, size_t shape_len, + ONNXTensorElementDataType type, + _Outptr_ OrtValue** out); + } // namespace OrtApis diff --git a/onnxruntime/core/session/utils.cc b/onnxruntime/core/session/utils.cc new file mode 100644 index 0000000000000..afb1ed2696c9f --- /dev/null +++ b/onnxruntime/core/session/utils.cc @@ -0,0 +1,125 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/session/utils.h" + +#include "core/framework/error_code_helper.h" +#include "core/framework/execution_provider.h" +#include "core/session/abi_session_options_impl.h" +// #include "core/session/environment.h" +#include "core/session/inference_session.h" +#include "core/session/inference_session_utils.h" +#include "core/session/onnxruntime_c_api.h" +#include "core/session/ort_apis.h" +#include "core/session/ort_env.h" + +using namespace onnxruntime; + +common::Status CopyStringToOutputArg(std::string_view str, const char* err_msg, char* out, size_t* size) { + const size_t str_len = str.size(); + const size_t req_size = str_len + 1; + + if (out == nullptr) { // User is querying the total output buffer size + *size = req_size; + return onnxruntime::common::Status::OK(); + } + + if (*size >= req_size) { // User provided a buffer of sufficient size + std::memcpy(out, str.data(), str_len); + out[str_len] = '\0'; + *size = req_size; + return onnxruntime::common::Status::OK(); + } + + // User has provided a buffer that is not large enough + *size = req_size; + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, err_msg); +} + +// Provide either model_path, or model_data + model_data_length. +OrtStatus* CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, + _In_ const OrtEnv* env, + _In_opt_z_ const ORTCHAR_T* model_path, + _In_opt_ const void* model_data, + size_t model_data_length, + std::unique_ptr<onnxruntime::InferenceSession>& sess) { + // quick check here to decide load path. 
InferenceSession will provide error message for invalid values. + // TODO: Could move to a helper + const Env& os_env = Env::Default(); // OS environment (!= ORT environment) + bool load_config_from_model = + os_env.GetEnvironmentVar(inference_session_utils::kOrtLoadConfigFromModelEnvVar) == "1"; + + if (load_config_from_model) { +#if !defined(ORT_MINIMAL_BUILD) + if (model_path != nullptr) { + sess = std::make_unique( + options == nullptr ? onnxruntime::SessionOptions() : options->value, + env->GetEnvironment(), + model_path); + } else { + sess = std::make_unique( + options == nullptr ? onnxruntime::SessionOptions() : options->value, + env->GetEnvironment(), + model_data, static_cast(model_data_length)); + } +#else + return OrtApis::CreateStatus(ORT_FAIL, "Loading config from ONNX models is not supported in this build."); +#endif + } else { + sess = std::make_unique( + options == nullptr ? onnxruntime::SessionOptions() : options->value, + env->GetEnvironment()); + } + +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) + // Add custom domains + if (options && !options->custom_op_domains_.empty()) { + ORT_API_RETURN_IF_STATUS_NOT_OK(sess->AddCustomOpDomains(options->custom_op_domains_)); + } +#endif + + // Finish load + if (load_config_from_model) { +#if !defined(ORT_MINIMAL_BUILD) + ORT_API_RETURN_IF_STATUS_NOT_OK(sess->Load()); +#endif + } else { + if (model_path != nullptr) { + ORT_API_RETURN_IF_STATUS_NOT_OK(sess->Load(model_path)); + } else { + ORT_API_RETURN_IF_STATUS_NOT_OK(sess->Load(model_data, static_cast(model_data_length))); + } + } + + return nullptr; +} + +OrtStatus* InitializeSession(_In_ const OrtSessionOptions* options, + _In_ onnxruntime::InferenceSession& sess, + _Inout_opt_ OrtPrepackedWeightsContainer* prepacked_weights_container) { + // we need to disable mem pattern if DML is one of the providers since DML doesn't have the concept of + // byte addressable memory + std::vector> provider_list; + if (options) { + for (auto& factory : options->provider_factories) { + auto provider = factory->CreateProvider(); + provider_list.push_back(std::move(provider)); + } + } + + // register the providers + for (auto& provider : provider_list) { + if (provider) { + ORT_API_RETURN_IF_STATUS_NOT_OK(sess.RegisterExecutionProvider(std::move(provider))); + } + } + + if (prepacked_weights_container != nullptr) { + ORT_API_RETURN_IF_STATUS_NOT_OK(sess.AddPrePackedWeightsContainer( + reinterpret_cast(prepacked_weights_container))); + } + + ORT_API_RETURN_IF_STATUS_NOT_OK(sess.Initialize()); + + return nullptr; +} diff --git a/onnxruntime/core/session/utils.h b/onnxruntime/core/session/utils.h new file mode 100644 index 0000000000000..ac8ad60758b5b --- /dev/null +++ b/onnxruntime/core/session/utils.h @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include "core/common/common.h" +#include "core/session/onnxruntime_c_api.h" + +onnxruntime::common::Status CopyStringToOutputArg(std::string_view str, const char* err_msg, char* out, size_t* size); + +struct OrtSessionOptions; +struct OrtStatus; +struct OrtPrepackedWeightsContainer; +namespace onnxruntime { +class InferenceSession; +} + +OrtStatus* CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, + _In_ const OrtEnv* env, + _In_opt_z_ const ORTCHAR_T* model_path, + _In_opt_ const void* model_data, + size_t model_data_length, + std::unique_ptr& sess); + +OrtStatus* InitializeSession(_In_ const OrtSessionOptions* options, + _In_ onnxruntime::InferenceSession& sess, + _Inout_opt_ OrtPrepackedWeightsContainer* prepacked_weights_container = nullptr); diff --git a/onnxruntime/test/framework/type_info_test.cc b/onnxruntime/test/framework/type_info_test.cc index ee787fb071d97..d8ef668bf1c7e 100644 --- a/onnxruntime/test/framework/type_info_test.cc +++ b/onnxruntime/test/framework/type_info_test.cc @@ -22,9 +22,9 @@ TEST(TypeInfoTests, TensorProto) { auto tensor_type_info = OrtTypeInfo::FromTypeProto(tensor_type.value); ASSERT_EQ(ONNX_TYPE_TENSOR, tensor_type_info->type); - ASSERT_NE(nullptr, tensor_type_info->data); - ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, tensor_type_info->data->type); - ASSERT_TRUE(SpanEq(AsSpan({1, 2, 3, 4}), tensor_type_info->data->shape.GetDims())); + ASSERT_NE(nullptr, tensor_type_info->tensor_type_info); + ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, tensor_type_info->tensor_type_info->type); + ASSERT_TRUE(SpanEq(AsSpan({1, 2, 3, 4}), tensor_type_info->tensor_type_info->shape.GetDims())); } TEST(TypeInfoTests, SequenceWithTensorElement) { @@ -37,9 +37,9 @@ TEST(TypeInfoTests, SequenceWithTensorElement) { const auto& tensor_type_info = *seq_type_info->sequence_type_info->sequence_key_type_; ASSERT_EQ(ONNX_TYPE_TENSOR, tensor_type_info.type); - ASSERT_NE(nullptr, tensor_type_info.data); - ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, tensor_type_info.data->type); - ASSERT_TRUE(SpanEq(AsSpan({1, 2, 3, 4}), tensor_type_info.data->shape.GetDims())); + ASSERT_NE(nullptr, tensor_type_info.tensor_type_info); + ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, tensor_type_info.tensor_type_info->type); + ASSERT_TRUE(SpanEq(AsSpan({1, 2, 3, 4}), tensor_type_info.tensor_type_info->shape.GetDims())); } TEST(TypeInfoTests, OptionalWithTensorProto) { @@ -54,9 +54,9 @@ TEST(TypeInfoTests, OptionalWithTensorProto) { const auto& contained_type = *optional_type_info->optional_type_info->contained_type_; ASSERT_EQ(ONNX_TYPE_TENSOR, contained_type.type); - ASSERT_NE(nullptr, contained_type.data); - ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, contained_type.data->type); - ASSERT_TRUE(SpanEq(AsSpan({1, 2, 3, 4}), contained_type.data->shape.GetDims())); + ASSERT_NE(nullptr, contained_type.tensor_type_info); + ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, contained_type.tensor_type_info->type); + ASSERT_TRUE(SpanEq(AsSpan({1, 2, 3, 4}), contained_type.tensor_type_info->shape.GetDims())); } #if !defined(DISABLE_ML_OPS) @@ -74,11 +74,11 @@ TEST(TypeInfoTests, MapWithTensorValue) { const auto& tensor_type_info = *map_info.map_value_type_; ASSERT_EQ(ONNX_TYPE_TENSOR, tensor_type_info.type); - ASSERT_NE(nullptr, tensor_type_info.data); - ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, tensor_type_info.data->type); - ASSERT_TRUE(SpanEq(AsSpan({1, 2, 3, 4}), tensor_type_info.data->shape.GetDims())); + ASSERT_NE(nullptr, 
tensor_type_info.tensor_type_info); + ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, tensor_type_info.tensor_type_info->type); + ASSERT_TRUE(SpanEq(AsSpan({1, 2, 3, 4}), tensor_type_info.tensor_type_info->shape.GetDims())); } #endif } // namespace test -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/test/shared_lib/custom_op_utils.h b/onnxruntime/test/shared_lib/custom_op_utils.h index e11540aaa5691..ea2a5f2771342 100644 --- a/onnxruntime/test/shared_lib/custom_op_utils.h +++ b/onnxruntime/test/shared_lib/custom_op_utils.h @@ -8,12 +8,6 @@ #include #endif -struct Input { - const char* name = nullptr; - std::vector dims; - std::vector values; -}; - struct MyCustomKernel { MyCustomKernel(const OrtApi& ort_api, const OrtKernelInfo* /*info*/) : ort_(ort_api) { diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index ca9ca0f82a25a..4216efdfdfdb8 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -1,17 +1,19 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include -#include -#include -#include -#include +#include #include +#include +#include +#include #include -#include +#include #include +#include #include +#include + #include "gtest/gtest.h" #include "gmock/gmock.h" @@ -25,13 +27,13 @@ #include "core/session/onnxruntime_run_options_config_keys.h" #include "core/util/thread_utils.h" -#include "onnxruntime_config.h" -#include "providers.h" -#include "test_allocator.h" -#include "test_fixture.h" -#include "utils.h" -#include "custom_op_utils.h" -#include +#include "test/shared_lib/custom_op_utils.h" +#include "test/shared_lib/test_fixture.h" +#include "test/shared_lib/utils.h" +#include "test/util/include/providers.h" +#include "test/util/include/test_allocator.h" + +#include "onnxruntime_config.h" // generated file in build output dir #ifdef _WIN32 #include @@ -63,48 +65,6 @@ constexpr size_t countof(T (&)[N]) { return N; } extern std::unique_ptr ort_env; -template -void RunSession(OrtAllocator* allocator, Ort::Session& session_object, - const std::vector& inputs, - const char* output_name, - const std::vector& dims_y, - const std::vector& values_y, - Ort::Value* output_tensor) { - std::vector ort_inputs; - std::vector input_names; - for (size_t i = 0; i < inputs.size(); i++) { - input_names.emplace_back(inputs[i].name); - ort_inputs.emplace_back( - Ort::Value::CreateTensor(allocator->Info(allocator), const_cast(inputs[i].values.data()), - inputs[i].values.size(), inputs[i].dims.data(), inputs[i].dims.size())); - } - - std::vector ort_outputs; - if (output_tensor) - session_object.Run(Ort::RunOptions{nullptr}, input_names.data(), ort_inputs.data(), ort_inputs.size(), - &output_name, output_tensor, 1); - else { - ort_outputs = session_object.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(), - &output_name, 1); - ASSERT_EQ(ort_outputs.size(), 1u); - output_tensor = &ort_outputs[0]; - } - - auto type_info = output_tensor->GetTensorTypeAndShapeInfo(); - ASSERT_EQ(type_info.GetShape(), dims_y); - size_t total_len = type_info.GetElementCount(); - ASSERT_EQ(values_y.size(), total_len); - - OutT* f = output_tensor->GetTensorMutableData(); - for (size_t i = 0; i != total_len; ++i) { - if constexpr (std::is_same::value || std::is_same::value) { - ASSERT_NEAR(values_y[i], f[i], 1e-3); - } else { - ASSERT_EQ(values_y[i], f[i]); - } - } -} - #ifdef 
USE_DML struct DmlObjects { ComPtr d3d12_device; @@ -300,12 +260,12 @@ Ort::Value CreateTensorValueFromExistingD3DResource( #endif -template +template > static void TestInference(Ort::Env& env, const std::basic_string& model_uri, const std::vector& inputs, const char* output_name, const std::vector& expected_dims_y, - const std::vector& expected_values_y, + const std::vector& expected_values_y, int provider_type, OrtCustomOpDomain* custom_op_domain_ptr, const ORTCHAR_T* custom_op_library_filename, @@ -362,26 +322,26 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod auto default_allocator = std::make_unique(); // without preallocated output tensor - RunSession(default_allocator.get(), - session, - inputs, - output_name, - expected_dims_y, - expected_values_y, - nullptr); + RunSession(default_allocator.get(), + session, + inputs, + output_name, + expected_dims_y, + expected_values_y, + nullptr); // with preallocated output tensor - Ort::Value value_y = Ort::Value::CreateTensor(default_allocator.get(), - expected_dims_y.data(), expected_dims_y.size()); + Ort::Value value_y = Ort::Value::CreateTensor(default_allocator.get(), + expected_dims_y.data(), expected_dims_y.size()); // test it twice for (int i = 0; i != 2; ++i) - RunSession(default_allocator.get(), - session, - inputs, - output_name, - expected_dims_y, - expected_values_y, - &value_y); + RunSession(default_allocator.get(), + session, + inputs, + output_name, + expected_dims_y, + expected_values_y, + &value_y); } } @@ -450,8 +410,8 @@ class CApiTestWithProvider : public testing::Test, public ::testing::WithParamIn TEST_P(CApiTestWithProvider, simple) { // simple inference test // prepare inputs - std::vector inputs(1); - Input& input = inputs.back(); + std::vector> inputs(1); + auto& input = inputs.back(); input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; @@ -621,8 +581,8 @@ TEST(CApiTest, SparseInputModel) { TEST(CApiTest, custom_op_handler) { std::cout << "Running custom op inference" << std::endl; - std::vector inputs(1); - Input& input = inputs[0]; + std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; @@ -657,8 +617,8 @@ TEST(CApiTest, custom_op_handler) { TEST(CApiTest, custom_op_set_input_memory_type) { std::cout << "Running custom op inference" << std::endl; - std::vector inputs(1); - Input& input = inputs[0]; + std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; @@ -687,8 +647,8 @@ TEST(CApiTest, custom_op_set_input_memory_type) { #if !defined(ORT_MINIMAL_BUILD) TEST(CApiTest, StandaloneOpHandler) { - std::vector inputs(1); - Input& input = inputs[0]; + std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; @@ -811,7 +771,7 @@ TEST(CApiTest, test_enable_ort_customops_stringlower) { // test custom op which accepts float and double as inputs TEST(CApiTest, varied_input_custom_op_handler) { - std::vector inputs(2); + std::vector> inputs(2); inputs[0].name = "X"; inputs[0].dims = {3}; inputs[0].values = {2.0f, 3.0f, 4.0f}; @@ -1422,8 +1382,8 @@ TEST(CApiTest, custom_op_with_attributes_handler) { TEST(CApiTest, RegisterCustomOpForCPUAndCUDA) { std::cout << "Tests registration of a custom op of the same name for both CPU and CUDA EPs" << std::endl; - std::vector inputs(1); - Input& input = inputs[0]; + 
std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; @@ -1531,7 +1491,7 @@ TEST(CApiTest, test_custom_op_openvino_wrapper_library) { // The custom op extracts the serialized .xml/.bin bytes and creates an in-memory OpenVINO model // during kernel creation. The custom op is passed an image of a hand-drawn "1" as an input during computation, which // is then inferenced using OpenVINO C++ APIs. - std::vector inputs(1); + std::vector> inputs(1); inputs[0].name = "Input3"; inputs[0].dims = {1, 1, 28, 28}; @@ -1630,7 +1590,7 @@ TEST(CApiTest, test_custom_op_library) { #endif std::cout << "Running inference using custom op shared library" << std::endl; - std::vector inputs(2); + std::vector> inputs(2); inputs[0].name = "input_1"; inputs[0].dims = {3, 5}; inputs[0].values = {1.1f, 2.2f, 3.3f, 4.4f, 5.5f, @@ -1682,7 +1642,7 @@ TEST(CApiTest, DISABLED_test_custom_op_shape_infer_attr) { #else TEST(CApiTest, test_custom_op_shape_infer_attr) { #endif - std::vector inputs(1); + std::vector> inputs(1); inputs[0].name = "input_0"; inputs[0].dims = {5}; inputs[0].values = {1.f, 2.f, 3.f, 4.f, 5.f}; @@ -1715,7 +1675,7 @@ TEST(CApiTest, test_custom_op_library_copy_variadic) { #endif std::cout << "Running inference using custom op shared library" << std::endl; - std::vector inputs(2); + std::vector> inputs(2); inputs[0].name = "input_0"; inputs[0].dims = {15}; inputs[0].values = {1.1f, 2.2f, 3.3f, 4.4f, 5.5f, @@ -1869,8 +1829,8 @@ void PrepareModule() { TEST(CApiTest, test_pyop) { std::call_once(my_module_flag, PrepareModule); - std::vector inputs(1); - Input& input = inputs[0]; + std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {2, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f}; @@ -1882,8 +1842,8 @@ TEST(CApiTest, test_pyop) { TEST(CApiTest, test_pyop_multi) { std::call_once(my_module_flag, PrepareModule); - std::vector inputs(1); - Input& input = inputs[0]; + std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {2, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f}; @@ -1895,8 +1855,8 @@ TEST(CApiTest, test_pyop_multi) { TEST(CApiTest, test_pyop_kwarg) { std::call_once(my_module_flag, PrepareModule); - std::vector inputs(1); - Input& input = inputs[0]; + std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {2, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f}; @@ -1920,7 +1880,7 @@ TEST(ReducedOpsBuildTest, test_excluded_ops) { // In reduced ops build, test a model containing ops not included in required_ops.config cannot be loaded. 
// See onnxruntime/test/testdata/reduced_build_test.readme.txt for more details of the setup constexpr PATH_TYPE model_uri = TSTR("testdata/reduced_build_test.onnx_model_with_excluded_ops"); - std::vector inputs = {{"X", {3}, {-1.0f, 2.0f, -3.0f}}}; + std::vector> inputs = {{"X", {3}, {-1.0f, 2.0f, -3.0f}}}; std::vector expected_dims_y = {3}; std::vector expected_values_y = {0.1f, 0.1f, 0.1f}; bool failed = false; @@ -3322,8 +3282,8 @@ TEST(CApiTest, TestSharedAllocators) { OrtEnv* env_ptr = (OrtEnv*)(*ort_env); // prepare inputs - std::vector inputs(1); - Input& input = inputs.back(); + std::vector> inputs(1); + auto& input = inputs.back(); input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; @@ -3509,8 +3469,8 @@ TEST(CApiTest, TestSharedAllocators) { TEST(CApiTest, TestSharingOfInitializerAndItsPrepackedVersion) { // simple inference test // prepare inputs - std::vector inputs(1); - Input& input = inputs.back(); + std::vector> inputs(1); + auto& input = inputs.back(); input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; @@ -3905,8 +3865,8 @@ TEST_P(CApiTensorRTTest, TestConfigureTensorRTProviderOptions) { // simple inference test // prepare inputs - std::vector inputs(1); - Input& input = inputs.back(); + std::vector> inputs(1); + auto& input = inputs.back(); input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; diff --git a/onnxruntime/test/shared_lib/test_model_builder_api.cc b/onnxruntime/test/shared_lib/test_model_builder_api.cc new file mode 100644 index 0000000000000..9807fcca06ed4 --- /dev/null +++ b/onnxruntime/test/shared_lib/test_model_builder_api.cc @@ -0,0 +1,701 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include + +#include "gtest/gtest.h" +#include "gmock/gmock.h" + +#include "core/common/narrow.h" +#include "core/graph/constants.h" +#include "core/session/onnxruntime_c_api.h" +#include "core/session/onnxruntime_cxx_api.h" +#include "core/session/onnxruntime_lite_custom_op.h" +#include "core/session/onnxruntime_session_options_config_keys.h" + +#include "test/shared_lib/test_fixture.h" +#include "test/shared_lib/utils.h" +#include "test/util/include/test_allocator.h" + +#include "onnxruntime_config.h" // generated file in build output dir + +extern std::unique_ptr ort_env; + +using namespace Ort; + +namespace { + +Ort::Session CreateSession(Ort::Env& env, + Model& graph_api_model, + Ort::SessionOptions* session_options_for_test = nullptr) { + Ort::SessionOptions default_session_options; + Ort::SessionOptions& session_options = session_options_for_test ? *session_options_for_test + : default_session_options; + + // Set this to save the model if you want to debug. + // session_options.SetOptimizedModelFilePath(ORT_TSTR("model_builder_output.onnx")); + + Ort::Session session(env, graph_api_model, session_options); + + // Session should not require the model to stay alive so free it now to validate. 
+ graph_api_model = Model(nullptr); + + return session; +} + +template <typename T> +void TestInference(Ort::Session& session, + const std::vector<Input<T>>& inputs, + const char* output_name, + const std::vector<int64_t>& expected_dims, + const std::vector<T>& expected_values) { + auto default_allocator = std::make_unique<MockedOrtAllocator>(); + + // without preallocated output tensor + RunSession(default_allocator.get(), + session, + inputs, + output_name, + expected_dims, + expected_values, + nullptr); +} + +// Create OrtNode using the C API +OrtNode* CreateNode(const OrtModelEditorApi& api, + const char* operator_name, const char* node_name, + const gsl::span<const char* const> input_names, + const gsl::span<const char* const> output_names, + const gsl::span<OrtOpAttr*> attributes = {}, + const char* domain_name = onnxruntime::kOnnxDomain) { + OrtNode* node = nullptr; + Ort::ThrowOnError(api.CreateNode(operator_name, domain_name, node_name, + input_names.data(), input_names.size(), + output_names.data(), output_names.size(), + attributes.data(), attributes.size(), + &node)); + return node; +} + +// convenience func to convert initializer lists to gsl::span +OrtNode* CreateNode(const OrtModelEditorApi& api, + const char* operator_name, const char* node_name, + const std::initializer_list<const char*> input_names, + const std::initializer_list<const char*> output_names, + const std::initializer_list<OrtOpAttr*> attributes = {}, + const char* domain_name = onnxruntime::kOnnxDomain) { + std::vector<const char*> inputs(input_names); + std::vector<const char*> outputs(output_names); + std::vector<OrtOpAttr*> attrs(attributes); + return CreateNode(api, operator_name, node_name, inputs, outputs, attrs, domain_name); +} +} // namespace + +struct TestAllocator : public OrtAllocator { + TestAllocator() { + version = ORT_API_VERSION; + Info = [](const struct OrtAllocator* this_ptr) -> const struct OrtMemoryInfo* { + auto* test_allocator = static_cast<const TestAllocator*>(this_ptr); + return test_allocator->memory_info; + }; + + Free = [](struct OrtAllocator* allocator, void* p) -> void { + auto* test_allocator = static_cast<TestAllocator*>(allocator); + // find the matching pointer and remove it + auto it = std::find_if(test_allocator->weights.begin(), test_allocator->weights.end(), + [p](const std::unique_ptr<std::vector<float>>& v) { return v->data() == p; }); + if (it == test_allocator->weights.end()) { + throw std::runtime_error("Free called with unknown pointer"); + } + + test_allocator->weights.erase(it); + }; + + Alloc = [](struct OrtAllocator* /*this*/, size_t /*size*/) -> void* { + throw std::runtime_error("This should not be used"); + }; + + Reserve = [](struct OrtAllocator* /*this*/, size_t /*size*/) -> void* { + throw std::runtime_error("This should not be used"); + }; + } + + // initializers that are used directly by the model. as there's no copy they must remain valid. + // we store them in the test allocator so we can validate that Free is called + std::vector<std::unique_ptr<std::vector<float>>> weights; + Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtDeviceAllocator, + OrtMemType::OrtMemTypeDefault); +}; + +// Test the ModelEditorAPI C API. +// Uses the ORT C++ API for the rest for simplicity. +TEST(ModelEditorAPITest, Basic_CApi) { + const auto& api = Ort::GetApi(); + const auto& model_editor_api = Ort::GetModelEditorApi(); + + TestAllocator deleter; + + // return void so we can use ASSERT_* in the lambda + const auto build_model = [&](bool use_constant_node, OrtModel*& model) -> void { + OrtGraph* graph = nullptr; + Ort::ThrowOnError(model_editor_api.CreateGraph(&graph)); + + // + // Create OrtModel with a Gemm. X input is 3x4, Y input is 4x8, Z output is 3x8. + // X is model input. Y is initializer. 
+ // Set the alpha attribute of the Gemm node to 2.0 to test attribute handling. + // + + // model input + OrtTensorTypeAndShapeInfo* tensor_type_info = nullptr; + std::vector input_dims = {3, 4}; + // can use api.SetSymbolicDimensions to set symbolic dimensions. + // the input array should have the same rank as the call to SetDimensions. + // e.g. call SetDimensions with {-1, 3, 2} and SetSymbolicDimensions with {"N", nullptr, nullptr} to create + // a shape of {"N", 3, 2} + + Ort::ThrowOnError(api.CreateTensorTypeAndShapeInfo(&tensor_type_info)); + Ort::ThrowOnError(api.SetTensorElementType(tensor_type_info, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT)); + Ort::ThrowOnError(api.SetDimensions(tensor_type_info, input_dims.data(), input_dims.size())); + + OrtTypeInfo* input_type_info = nullptr; + Ort::ThrowOnError(model_editor_api.CreateTensorTypeInfo(tensor_type_info, &input_type_info)); + api.ReleaseTensorTypeAndShapeInfo(tensor_type_info); // input_type_info took a copy + + // create ValueInfo and release the type info as CreateValueInfo takes a copy. + OrtValueInfo* input_value_info = nullptr; + Ort::ThrowOnError(model_editor_api.CreateValueInfo("X", input_type_info, &input_value_info)); + api.ReleaseTypeInfo(input_type_info); // input_value_info took a copy + tensor_type_info = nullptr; + + // model outputs + OrtTypeInfo* output_type_info = nullptr; + std::vector output_dims = {3, 8}; + + Ort::ThrowOnError(api.CreateTensorTypeAndShapeInfo(&tensor_type_info)); + Ort::ThrowOnError(api.SetTensorElementType(tensor_type_info, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT)); + Ort::ThrowOnError(api.SetDimensions(tensor_type_info, output_dims.data(), output_dims.size())); + + Ort::ThrowOnError(model_editor_api.CreateTensorTypeInfo(tensor_type_info, &output_type_info)); + api.ReleaseTensorTypeAndShapeInfo(tensor_type_info); // input_type_info took a copy + + OrtValueInfo* output_value_info = nullptr; + Ort::ThrowOnError(model_editor_api.CreateValueInfo("Z", output_type_info, &output_value_info)); + api.ReleaseTypeInfo(output_type_info); + + std::vector graph_inputs = {input_value_info}; + std::vector graph_outputs = {output_value_info}; + Ort::ThrowOnError(model_editor_api.SetGraphInputs(graph, graph_inputs.data(), graph_inputs.size())); + Ort::ThrowOnError(model_editor_api.SetGraphOutputs(graph, graph_outputs.data(), graph_outputs.size())); + input_value_info = nullptr; // graph now owns the input/output values + output_value_info = nullptr; + + // + // Gemm node + // + + OrtOpAttr* alpha_attr = nullptr; + float alpha_value = 2.0; + Ort::ThrowOnError(api.CreateOpAttr("alpha", &alpha_value, 1, OrtOpAttrType::ORT_OP_ATTR_FLOAT, &alpha_attr)); + + std::vector node_input_names = {"X", "Y"}; + const std::string gemm_output_name = use_constant_node ? "Z_temp" : "Z"; + std::vector node_output_names = {gemm_output_name.c_str()}; + std::vector node_attributes{alpha_attr}; + OrtNode* node = CreateNode(model_editor_api, "Gemm", "Gemm1", node_input_names, node_output_names, node_attributes); + alpha_attr = nullptr; // Node now owns + + Ort::ThrowOnError(model_editor_api.AddNodeToGraph(graph, node)); + node = nullptr; // graph now owns node + + // Y input + // As it's 128 bytes it could either be allocated using CreateTensorAsOrtValue or use existing memory. + // Under 128 bytes must use CreateTensorAsOrtValue. 
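+ // The 128 byte threshold mirrors the check in AddInitializerToGraph: ONNX shape inferencing cannot
+ // read external data, so anything smaller must be created with CreateTensorAsOrtValue and owned by the graph.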
+ std::vector y_dims = {4, 8}; + + deleter.weights.emplace_back(std::make_unique>(32)); + auto& y_values = *deleter.weights.back(); + std::iota(y_values.begin(), y_values.end(), 1.0f); + + // create an initializer for the Y input. add to `weights` so the memory remains valid. + OrtValue* y_tensor = nullptr; + Ort::ThrowOnError( + api.CreateTensorWithDataAndDeleterAsOrtValue(&deleter, + y_values.data(), y_values.size() * sizeof(y_values[0]), + y_dims.data(), y_dims.size(), + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, + &y_tensor)); + + Ort::ThrowOnError(model_editor_api.AddInitializerToGraph(graph, "Y", y_tensor, /*data is external*/ true)); + y_tensor = nullptr; // graph now owns + + if (use_constant_node) { + // Test that a Constant node is converted to an initializer + + // create Constant nodes for min/max to limit output range + OrtOpAttr* min_attr = nullptr; + float min = 400.0f; + Ort::ThrowOnError(api.CreateOpAttr("value", &min, sizeof(min), ORT_OP_ATTR_FLOAT, &min_attr)); + node = CreateNode(model_editor_api, "Constant", "clip_min", {}, {"min"}, {min_attr}); + Ort::ThrowOnError(model_editor_api.AddNodeToGraph(graph, node)); + node = nullptr; // graph now owns node + + OrtOpAttr* max_attr = nullptr; + float max = 900.0f; + Ort::ThrowOnError(api.CreateOpAttr("value", &max, sizeof(max), ORT_OP_ATTR_FLOAT, &max_attr)); + node = CreateNode(model_editor_api, "Constant", "clip_max", {}, {"max"}, {max_attr}); + Ort::ThrowOnError(model_editor_api.AddNodeToGraph(graph, node)); + node = nullptr; // graph now owns node + + node = CreateNode(model_editor_api, "Clip", "Clip1", {gemm_output_name.c_str(), "min", "max"}, {"Z"}); + Ort::ThrowOnError(model_editor_api.AddNodeToGraph(graph, node)); + node = nullptr; // graph now owns node + } + + std::vector domain_names = {onnxruntime::kOnnxDomain}; + std::vector opset_versions = {18}; + Ort::ThrowOnError(model_editor_api.CreateModel(domain_names.data(), opset_versions.data(), domain_names.size(), + &model)); + Ort::ThrowOnError(model_editor_api.AddGraphToModel(model, graph)); + graph = nullptr; // model now owns + }; + + auto run_test = [&](bool use_constant_node) -> void { + OrtModel* model = nullptr; + build_model(use_constant_node, model); + + ASSERT_NE(model, nullptr) << "build_model should have created a model"; + + std::vector> inputs(1); + auto& input = inputs[0]; + input.name = "X"; + input.dims = {3, 4}; + input.values = {1.0f, 2.0f, 3.0f, 4.0f, + 8.0f, 7.0f, 6.0f, 5.0f, + 9.0f, 3.0f, 5.0f, 7.0f}; + + std::vector expected_dims = {3, 8}; + Model cxx_model(model); + auto session = CreateSession(*ort_env, cxx_model); + + std::vector expected_output; + if (use_constant_node) { + // clipped with min 400 and max 900 + expected_output = {400.0f, 400.0f, 400.0f, 400.0f, 420.0f, 440.0f, 460.0f, 480.0f, + 596.0f, 648.0f, 700.0f, 752.0f, 804.0f, 856.0f, 900.0f, 900.0f, + 592.0f, 640.0f, 688.0f, 736.0f, 784.0f, 832.0f, 880.0f, 900.0f}; + } else { + expected_output = {340.0f, 360.0f, 380.0f, 400.0f, 420.0f, 440.0f, 460.0f, 480.0f, + 596.0f, 648.0f, 700.0f, 752.0f, 804.0f, 856.0f, 908.0f, 960.0f, + 592.0f, 640.0f, 688.0f, 736.0f, 784.0f, 832.0f, 880.0f, 928.0f}; + } + + TestInference(session, inputs, "Z", expected_dims, expected_output); + + api.ReleaseSession(session.release()); + + ASSERT_EQ(deleter.weights.size(), size_t(0)) << "All weights should have been freed"; + }; + + run_test(false); + run_test(true); // use Constant node for initializer +} + +TEST(ModelEditorAPITest, Basic_CxxApi) { + // initializers that are used directly by the model. 
+  std::vector<std::unique_ptr<std::vector<float>>> weights;
+
+  Ort::Graph graph;
+
+  //
+  // Create OrtModel with a Gemm. X input is 3x4, Y input is 4x8, Z output is 3x8.
+  // X is model input. Y is initializer.
+  // Set the alpha attribute of the Gemm node to 2.0 to test attribute handling.
+  //
+
+  std::vector<ValueInfo> graph_inputs;
+  std::vector<ValueInfo> graph_outputs;
+
+  // model input. it's {3, 4} but use a symbolic dim to test that works.
+  std::vector<int64_t> input_dims({-1, 4});
+  std::vector<std::string> input_symbolic_dims({"multiple_of_3", ""});
+  TensorTypeAndShapeInfo input_tensor_info(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
+                                           input_dims,
+                                           &input_symbolic_dims);
+  auto input_type_info = TypeInfo::CreateTensorInfo(input_tensor_info.GetConst());
+  graph_inputs.emplace_back("X", input_type_info.GetConst());
+
+  // model outputs
+  std::vector<int64_t> output_dims = {-1, 8};
+  std::vector<std::string> output_symbolic_dims({"multiple_of_3", ""});
+  TensorTypeAndShapeInfo output_tensor_info(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
+                                            output_dims,
+                                            &output_symbolic_dims);
+  auto output_type_info = TypeInfo::CreateTensorInfo(output_tensor_info.GetConst());
+  graph_outputs.emplace_back("Z", output_type_info.GetConst());
+
+  graph.SetInputs(graph_inputs);
+  graph.SetOutputs(graph_outputs);
+
+  //
+  // Gemm node
+  //
+
+  std::vector<OpAttr> attributes;
+  float alpha_value = 2.0f;
+  attributes.push_back(OpAttr("alpha", &alpha_value, 1, OrtOpAttrType::ORT_OP_ATTR_FLOAT));
+
+  Node node("Gemm", onnxruntime::kOnnxDomain, "Gemm1", {"X", "Y"}, {"Z"}, attributes);
+
+  graph.AddNode(node);
+
+  // create an initializer for the Y input.
+  // add to `weights` so it remains valid for the lifetime of the session and we can avoid copying the data.
+  // Initializers of 128 bytes or more can either be allocated with CreateTensorAsOrtValue or use existing memory.
+  // Initializers under 128 bytes must use CreateTensorAsOrtValue.
+  std::vector<int64_t> y_dims = {4, 8};
+
+  weights.emplace_back(std::make_unique<std::vector<float>>(32));
+  auto& y_values = *weights.back();
+  std::iota(y_values.begin(), y_values.end(), 1.0f);
+
+  auto info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
+
+  // if you use this API the initializer data MUST remain valid for the lifetime of the InferenceSession
+  auto y_tensor = Value::CreateTensor(info, y_values.data(), y_values.size(), y_dims.data(), y_dims.size());
+  graph.AddInitializer("Y", y_tensor, /*data is external*/ true);
+
+  std::vector<Model::DomainOpsetPair> opsets{{onnxruntime::kOnnxDomain, 18}};
+  Model model(opsets);
+  model.AddGraph(graph);
+
+  std::vector<Input<float>> inputs(1);
+  auto& input = inputs[0];
+  input.name = "X";
+  input.dims = {3, 4};
+  input.values = {1.0f, 2.0f, 3.0f, 4.0f,
+                  8.0f, 7.0f, 6.0f, 5.0f,
+                  9.0f, 3.0f, 5.0f, 7.0f};
+
+  std::vector<int64_t> expected_dims = {3, 8};
+
+  auto session = CreateSession(*ort_env, model);
+  TestInference<float>(session, inputs, "Z", expected_dims,
+                       {340.0f, 360.0f, 380.0f, 400.0f, 420.0f, 440.0f, 460.0f, 480.0f,
+                        596.0f, 648.0f, 700.0f, 752.0f, 804.0f, 856.0f, 908.0f, 960.0f,
+                        592.0f, 640.0f, 688.0f, 736.0f, 784.0f, 832.0f, 880.0f, 928.0f});
+}
+
+TEST(ModelEditorAPITest, BasicModelEdit_CxxApi) {
+  //
+  // Load existing model
+  // Add Cast to change the model input from float to int64
+  // Update model inputs to match
+  // Run
+  //
+
+  SessionOptions so;
+
+  // Set this to save the model if you want to debug.
+  // so.SetOptimizedModelFilePath(ORT_TSTR("model_builder_edited.onnx"));
+
+  Session session = Session::CreateModelEditorSession(*ort_env, TSTR("testdata/mnist.onnx"), so);
+
+  ASSERT_EQ(session.GetOpset(""), 8);  // ONNX domain is empty string
+
+  // we augment the original model with nodes, initializers and the updated model inputs/outputs from this model.
+  // the original graph is unchanged. nodes can be added before/after it. initializers can be added.
+  // new nodes must conform to the original domain:opset of the model.
+  // additional operator domain:opset pairs can be added.
+  std::vector<Model::DomainOpsetPair> opsets;  // no additional opsets required
+  Model model(opsets);
+
+  std::vector<ValueInfo> graph_inputs = session.GetInputs();
+  ASSERT_EQ(graph_inputs.size(), size_t(1));
+  ASSERT_EQ(graph_inputs[0].TypeInfo().GetTensorTypeAndShapeInfo().GetElementType(),
+            ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
+
+  // typically this isn't needed. we replace this input but need to read info from it later on in the test
+  // validation so we save the info locally to keep it accessible.
+  auto orig_input_name = graph_inputs[0].Name();
+  auto input_shape = graph_inputs[0].TypeInfo().GetTensorTypeAndShapeInfo().GetShape();
+  const std::string new_input_name = "Int64Input";
+
+  // Add a Cast node so the model input becomes int64.
+  // The Cast converts the new int64 input back to float for the original nodes.
+  std::vector<OpAttr> attributes;
+  int64_t to = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  attributes.push_back(OpAttr("to", &to, 1, OrtOpAttrType::ORT_OP_ATTR_INT));
+
+  Ort::Node node("Cast", onnxruntime::kOnnxDomain, new_input_name, {"Int64Input"},
+                 // the existing node will now consume the output from the Cast instead of a graph input
+                 {orig_input_name},
+                 attributes);
+
+  // we're replacing the only input. the shape is the same but the name and data type change.
+  TensorTypeAndShapeInfo input_tensor_info(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64,
+                                           input_shape);
+  auto input_type_info = TypeInfo::CreateTensorInfo(input_tensor_info.GetConst());
+  graph_inputs[0] = ValueInfo(new_input_name, input_type_info.GetConst());
+
+  Graph graph;  // new info to augment the model with
+
+  graph.AddNode(node);
+  graph.SetInputs(graph_inputs);
+
+  // the node we added does not require any new opsets.
+  model.AddGraph(graph);
+  session.FinalizeModelEditorSession(model, so);
+
+  std::vector<Input<int64_t>> inputs(1);
+  auto& input = inputs[0];
+  input.name = new_input_name.c_str();
+  input.dims = input_shape;
+
+  auto num_values = std::accumulate(input.dims.begin(), input.dims.end(), int64_t(1), std::multiplies<int64_t>());
+  input.values.resize(size_t(num_values));
+  std::iota(input.values.begin(), input.values.end(), 1);
+
+  std::vector<int64_t> expected_dims = {1, 10};
+  std::vector<float> expected_output = {-48.5088f, -1040.2948f, -347.0959f, 101.7392f, 421.3352f,
+                                        750.92145f, 231.5060f, -1694.4152f, 681.5623f, 378.1689f};
+
+  TestInference<float>(session, inputs, session.GetOutputNames()[0].c_str(), expected_dims, expected_output);
+
+  // double check with original model
+  {
+    SessionOptions expected_so;
+    Session expected_session = Session(*ort_env, TSTR("testdata/mnist.onnx"), expected_so);
+    std::vector<Input<float>> expected_inputs(1);
+    auto& expected_input = expected_inputs[0];
+    expected_input.name = orig_input_name.c_str();
+    expected_input.dims = input_shape;
+    expected_input.values.reserve(size_t(num_values));
+    std::transform(input.values.begin(), input.values.end(), std::back_inserter(expected_input.values),
+                   [&](int64_t value) { return float(value); });
+
+    TestInference<float>(expected_session, expected_inputs, session.GetOutputNames()[0].c_str(),
+                         expected_dims, expected_output);
+  }
+}
+
+TEST(ModelEditorAPITest, InvalidDimension) {
+  try {
+    std::vector<int64_t> input_dims = {-2, 2};
+    TensorTypeAndShapeInfo tensor_type_info(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
+                                            input_dims);
+    // invalid dim of -2 should cause exception
+    TypeInfo::CreateTensorInfo(tensor_type_info.GetConst());
+    FAIL() << "Expected exception for invalid dimension";
+  } catch (const Ort::Exception& e) {
+    ASSERT_STREQ(e.what(), "dim_values must be -1 (symbolic dimension) or larger.");
+  }
+}
+
+TEST(ModelEditorAPITest, CreateInvalidModel_NoOpsets) {
+  Ort::Graph graph;
+  std::vector<ValueInfo> graph_inputs;
+  std::vector<ValueInfo> graph_outputs;
+
+  std::vector<int64_t> dims({4});
+  TensorTypeAndShapeInfo tensor_info(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, dims);
+  auto type_info = TypeInfo::CreateTensorInfo(tensor_info.GetConst());
+  graph_inputs.emplace_back("X", type_info.GetConst());
+  graph_outputs.emplace_back("Z", type_info.GetConst());
+
+  graph.SetInputs(graph_inputs);
+  graph.SetOutputs(graph_outputs);
+
+  Ort::Node node("Add", onnxruntime::kOnnxDomain, "Add1", {"X", "X"}, {"Z"});
+
+  graph.AddNode(node);
+
+  std::vector<Model::DomainOpsetPair> opsets;
+  Model model(opsets);
+  model.AddGraph(graph);
+
+  try {
+    auto session = CreateSession(*ort_env, model);
+    FAIL();
+  } catch (const Ort::Exception& e) {
+    ASSERT_THAT(e.what(), ::testing::HasSubstr("Error No opset import for domain"));
+  }
+}
+
+TEST(ModelEditorAPITest, CreateInvalidModel_MissingValue) {
+  Ort::Graph graph;
+
+  std::vector<ValueInfo> graph_inputs;
+  std::vector<ValueInfo> graph_outputs;
+
+  std::vector<int64_t> dims({4});
+  TensorTypeAndShapeInfo tensor_info(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, dims);
+  auto type_info = TypeInfo::CreateTensorInfo(tensor_info.GetConst());
+  graph_inputs.emplace_back("X", type_info.GetConst());
+  graph_outputs.emplace_back("Z", type_info.GetConst());
+
+  graph.SetInputs(graph_inputs);
+  graph.SetOutputs(graph_outputs);
+
+  Ort::Node node("Add", onnxruntime::kOnnxDomain, "Add1", {"X", "missing"}, {"Z"});
+  graph.AddNode(node);
+
+  std::vector<Model::DomainOpsetPair> opsets{{onnxruntime::kOnnxDomain, 18}};
+  Model model(opsets);
+  model.AddGraph(graph);
+
+  try {
+    auto session =
CreateSession(*ort_env, model);
+    FAIL();
+  } catch (const Ort::Exception& e) {
+    ASSERT_THAT(e.what(), ::testing::HasSubstr("Node input 'missing' is not a graph input, "
+                                               "initializer, or output of a previous node."));
+  }
+}
+
+TEST(ModelEditorAPITest, InvalidModelEdit) {
+  // Add a node but make the edit invalid in various ways
+  // - add node but don't update graph inputs
+  // - add node with invalid domain
+  const auto edit_model = [](bool invalid_domain) {
+    SessionOptions so;
+
+    // Set this to save the model if you want to debug.
+    // so.SetOptimizedModelFilePath(ORT_TSTR("model_builder_edited.onnx"));
+
+    Session session = Session::CreateModelEditorSession(*ort_env, TSTR("testdata/mnist.onnx"), so);
+
+    ASSERT_EQ(session.GetOpset(""), 8);  // ONNX domain is empty string
+
+    std::vector<Model::DomainOpsetPair> opsets;  // no additional opsets required
+    Model model(opsets);
+    Graph graph;  // new info to augment the model with
+
+    const char* domain = invalid_domain ? "invalid_domain" : onnxruntime::kOnnxDomain;
+
+    std::vector<ValueInfo> graph_inputs = session.GetInputs();
+    ASSERT_EQ(graph_inputs.size(), size_t(1));
+    ASSERT_EQ(graph_inputs[0].TypeInfo().GetTensorTypeAndShapeInfo().GetElementType(),
+              ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
+
+    const std::string new_input_name = "Int64Input";
+
+    // Add a Cast node so the model input becomes int64.
+    // The Cast converts the new int64 input back to float for the original nodes.
+    std::vector<OpAttr> attributes;
+    int64_t to = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+    attributes.push_back(OpAttr("to", &to, 1, OrtOpAttrType::ORT_OP_ATTR_INT));
+
+    Node node("Cast", domain, "NewInputNode", {new_input_name},
+              // the existing node will now consume the output from the Cast instead of a graph input
+              {graph_inputs[0].Name()},
+              attributes);
+    graph.AddNode(node);
+
+    if (invalid_domain) {
+      // we're replacing the only input. the shape is the same but the name and data type change.
+      TensorTypeAndShapeInfo input_tensor_info(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64,
+                                               graph_inputs[0].TypeInfo().GetTensorTypeAndShapeInfo().GetShape());
+      auto input_type_info = TypeInfo::CreateTensorInfo(input_tensor_info.GetConst());
+      graph_inputs[0] = ValueInfo(new_input_name, input_type_info.GetConst());
+      graph.SetInputs(graph_inputs);
+    } else {
+      // model should be invalid as we didn't connect the new node up to the graph inputs
+    }
+
+    // the node we added does not require any new opsets.
+ model.AddGraph(graph); + + try { + session.FinalizeModelEditorSession(model, so); + FAIL() << "Should have failed to resolve graph due to invalid edits."; + } catch (const Ort::Exception& e) { + if (invalid_domain) { + ASSERT_THAT(e.what(), ::testing::HasSubstr("Error No opset import for domain 'invalid_domain'")); + } else { + ASSERT_THAT(e.what(), ::testing::HasSubstr("This is an invalid model")); + } + } + }; + + edit_model(false); + edit_model(true); // add node with invalid domain +} + +TEST(ModelEditorAPITest, CreateTypeInfo) { + const auto& api = Ort::GetApi(); + const auto& model_editor_api = Ort::GetModelEditorApi(); + + TensorTypeAndShapeInfo base_tensor_info(ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, + {2, 4}); + + OrtTypeInfo* base_tensor_type_info = nullptr; + Ort::ThrowOnError(model_editor_api.CreateTensorTypeInfo(base_tensor_info, &base_tensor_type_info)); + + ONNXType onnx_type = ONNX_TYPE_UNKNOWN; + const OrtTensorTypeAndShapeInfo* tensor_info = nullptr; + ONNXTensorElementDataType onnx_element_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + + // sparse tensor + OrtTypeInfo* sparse_tensor_type_info = nullptr; + Ort::ThrowOnError(model_editor_api.CreateSparseTensorTypeInfo(base_tensor_info, &sparse_tensor_type_info)); + Ort::ThrowOnError(api.GetOnnxTypeFromTypeInfo(sparse_tensor_type_info, &onnx_type)); + ASSERT_EQ(onnx_type, ONNXType::ONNX_TYPE_SPARSETENSOR); + Ort::ThrowOnError(api.CastTypeInfoToTensorInfo(sparse_tensor_type_info, &tensor_info)); + Ort::ThrowOnError(api.GetTensorElementType(tensor_info, &onnx_element_type)); + ASSERT_EQ(onnx_element_type, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT); + api.ReleaseTypeInfo(sparse_tensor_type_info); + + // sequence + OrtTypeInfo* sequence_type_info = nullptr; + const OrtSequenceTypeInfo* sequence_info = nullptr; + OrtTypeInfo* sequence_element_type_info = nullptr; + + Ort::ThrowOnError(model_editor_api.CreateSequenceTypeInfo(base_tensor_type_info, &sequence_type_info)); + Ort::ThrowOnError(api.GetOnnxTypeFromTypeInfo(sequence_type_info, &onnx_type)); + ASSERT_EQ(onnx_type, ONNXType::ONNX_TYPE_SEQUENCE); + Ort::ThrowOnError(api.CastTypeInfoToSequenceTypeInfo(sequence_type_info, &sequence_info)); + Ort::ThrowOnError(api.GetSequenceElementType(sequence_info, &sequence_element_type_info)); + Ort::ThrowOnError(api.GetOnnxTypeFromTypeInfo(sequence_element_type_info, &onnx_type)); + ASSERT_EQ(onnx_type, ONNXType::ONNX_TYPE_TENSOR); + Ort::ThrowOnError(api.CastTypeInfoToTensorInfo(sequence_element_type_info, &tensor_info)); + Ort::ThrowOnError(api.GetTensorElementType(tensor_info, &onnx_element_type)); + ASSERT_EQ(onnx_element_type, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT); + api.ReleaseTypeInfo(sequence_element_type_info); + api.ReleaseTypeInfo(sequence_type_info); + + // map + OrtTypeInfo* map_type_info = nullptr; + const OrtMapTypeInfo* map_info = nullptr; + ONNXTensorElementDataType map_key_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + OrtTypeInfo* map_value_type_info = nullptr; + Ort::ThrowOnError(model_editor_api.CreateMapTypeInfo(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, base_tensor_type_info, + &map_type_info)); // clones map_type_info + Ort::ThrowOnError(api.GetOnnxTypeFromTypeInfo(map_type_info, &onnx_type)); + ASSERT_EQ(onnx_type, ONNXType::ONNX_TYPE_MAP); + Ort::ThrowOnError(api.CastTypeInfoToMapTypeInfo(map_type_info, &map_info)); + Ort::ThrowOnError(api.GetMapKeyType(map_info, &map_key_type)); + ASSERT_EQ(map_key_type, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64); + Ort::ThrowOnError(api.GetMapValueType(map_info, 
&map_value_type_info)); + Ort::ThrowOnError(api.GetOnnxTypeFromTypeInfo(map_value_type_info, &onnx_type)); + ASSERT_EQ(onnx_type, ONNXType::ONNX_TYPE_TENSOR); + Ort::ThrowOnError(api.CastTypeInfoToTensorInfo(map_value_type_info, &tensor_info)); + Ort::ThrowOnError(api.GetTensorElementType(tensor_info, &onnx_element_type)); + ASSERT_EQ(onnx_element_type, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT); + api.ReleaseTypeInfo(map_value_type_info); + api.ReleaseTypeInfo(map_type_info); + + // optional + OrtTypeInfo* optional_type_info = nullptr; + const OrtOptionalTypeInfo* optional_info = nullptr; + OrtTypeInfo* optional_contained_type_info = nullptr; + Ort::ThrowOnError(model_editor_api.CreateOptionalTypeInfo(base_tensor_type_info, &optional_type_info)); + Ort::ThrowOnError(api.GetOnnxTypeFromTypeInfo(optional_type_info, &onnx_type)); + ASSERT_EQ(onnx_type, ONNXType::ONNX_TYPE_OPTIONAL); + Ort::ThrowOnError(api.CastTypeInfoToOptionalTypeInfo(optional_type_info, &optional_info)); + Ort::ThrowOnError(api.GetOptionalContainedTypeInfo(optional_info, &optional_contained_type_info)); + Ort::ThrowOnError(api.GetOnnxTypeFromTypeInfo(optional_contained_type_info, &onnx_type)); + ASSERT_EQ(onnx_type, ONNXType::ONNX_TYPE_TENSOR); + api.ReleaseTypeInfo(optional_contained_type_info); + api.ReleaseTypeInfo(optional_type_info); + + api.ReleaseTypeInfo(base_tensor_type_info); +} diff --git a/onnxruntime/test/shared_lib/test_ort_format_models.cc b/onnxruntime/test/shared_lib/test_ort_format_models.cc index 99a9ebc3362ae..b3491e3476f23 100644 --- a/onnxruntime/test/shared_lib/test_ort_format_models.cc +++ b/onnxruntime/test/shared_lib/test_ort_format_models.cc @@ -17,7 +17,7 @@ extern std::unique_ptr ort_env; [[maybe_unused]] static void TestInference(Ort::Env& env, const std::basic_string& model_uri, - const std::vector& inputs, const char* output_name, + const std::vector>& inputs, const char* output_name, const std::vector& expected_dims_y, const std::vector& expected_values_y, Ort::CustomOpDomain& custom_op_domain, void* cuda_compute_stream = nullptr) { Ort::SessionOptions session_options; @@ -100,8 +100,8 @@ TEST(OrtFormatCustomOpTests, ConvertOnnxModelToOrt) { } // now load the ORT format model and execute it - std::vector inputs(1); - Input& input = inputs[0]; + std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; @@ -130,8 +130,8 @@ TEST(OrtFormatCustomOpTests, LoadOrtModel) { custom_op_domain.Add(&custom_op); // load the ORT format model and execute it - std::vector inputs(1); - Input& input = inputs[0]; + std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {3, 2}; input.values = {6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}; @@ -151,8 +151,8 @@ TEST(OrtFormatCustomOpTests, LoadOrtModelStandaloneCustomOpImplementation) { custom_op_domain.Add(&standalone_op); // load the ORT format model and execute it - std::vector inputs(1); - Input& input = inputs[0]; + std::vector> inputs(1); + auto& input = inputs[0]; input.name = "X"; input.dims = {3, 2}; input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; diff --git a/onnxruntime/test/shared_lib/utils.h b/onnxruntime/test/shared_lib/utils.h index 483753f2ae6b2..5d15582b86cb9 100644 --- a/onnxruntime/test/shared_lib/utils.h +++ b/onnxruntime/test/shared_lib/utils.h @@ -5,4 +5,56 @@ #include "core/session/onnxruntime_cxx_api.h" +#include "gtest/gtest.h" + OrtCUDAProviderOptions CreateDefaultOrtCudaProviderOptionsWithCustomStream(void* cuda_compute_stream = 
nullptr);
+
+template <typename T>
+struct Input {
+  const char* name = nullptr;
+  std::vector<int64_t> dims;
+  std::vector<T> values;
+};
+
+template <typename OutT, typename InputT = Input<float>>
+void RunSession(OrtAllocator* allocator,
+                Ort::Session& session_object,
+                const std::vector<InputT>& inputs,
+                const char* output_name,
+                const std::vector<int64_t>& output_dims,
+                const std::vector<OutT>& expected_output,
+                Ort::Value* output_tensor) {
+  std::vector<Ort::Value> ort_inputs;
+  std::vector<const char*> input_names;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    input_names.emplace_back(inputs[i].name);
+    ort_inputs.emplace_back(
+        Ort::Value::CreateTensor(allocator->Info(allocator),
+                                 const_cast<typename std::decay_t<decltype(inputs[i].values)>::value_type*>(
+                                     inputs[i].values.data()),
+                                 inputs[i].values.size(), inputs[i].dims.data(), inputs[i].dims.size()));
+  }
+
+  std::vector<Ort::Value> ort_outputs;
+  if (output_tensor)
+    session_object.Run(Ort::RunOptions{nullptr}, input_names.data(), ort_inputs.data(), ort_inputs.size(),
+                       &output_name, output_tensor, 1);
+  else {
+    ort_outputs = session_object.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(),
+                                     &output_name, 1);
+    ASSERT_EQ(ort_outputs.size(), 1u);
+    output_tensor = &ort_outputs[0];
+  }
+
+  auto type_info = output_tensor->GetTensorTypeAndShapeInfo();
+  ASSERT_EQ(type_info.GetShape(), output_dims);
+  size_t total_len = type_info.GetElementCount();
+  ASSERT_EQ(expected_output.size(), total_len);
+
+  auto* actual = output_tensor->GetTensorMutableData<OutT>();
+  for (size_t i = 0; i != total_len; ++i) {
+    if constexpr (std::is_same<OutT, float>::value || std::is_same<OutT, double>::value) {
+      EXPECT_NEAR(expected_output[i], actual[i], 1e-3) << "i=" << i;
+    } else {
+      EXPECT_EQ(expected_output[i], actual[i]) << "i=" << i;
+    }
+  }
+}
diff --git a/winml/adapter/winml_adapter_model.cpp b/winml/adapter/winml_adapter_model.cpp
index 195bf6e5f0ffd..cf02c6fa2328b 100644
--- a/winml/adapter/winml_adapter_model.cpp
+++ b/winml/adapter/winml_adapter_model.cpp
@@ -593,13 +593,13 @@ ORT_API_STATUS_IMPL(
   input.set_name(input_name);
 
   if (info->type == ONNXType::ONNX_TYPE_TENSOR) {
-    auto num_dims = info->data->shape.NumDimensions();
+    auto num_dims = info->tensor_type_info->shape.NumDimensions();
     CreateTypeProto_Tensor(
       input.mutable_type()->mutable_tensor_type(),
       input_name,
-      (num_dims == 0) ? nullptr : &info->data->shape[0],
+      (num_dims == 0) ?
nullptr : &info->tensor_type_info->shape[0], num_dims, - ONNXTensorElementDataTypeToTensorProto_DataType(info->data->type) + ONNXTensorElementDataTypeToTensorProto_DataType(info->tensor_type_info->type) ); } return nullptr; @@ -619,12 +619,12 @@ ORT_API_STATUS_IMPL( ONNX_NAMESPACE::TensorProto& input = *graph.add_initializer(); input.set_name(input_name); - auto num_dims = info->data->shape.NumDimensions(); + auto num_dims = info->tensor_type_info->shape.NumDimensions(); for (size_t i = 0; i < num_dims; i++) { - input.add_dims(info->data->shape[i]); + input.add_dims(info->tensor_type_info->shape[i]); } - input.set_data_type(ONNXTensorElementDataTypeToTensorProto_DataType(info->data->type)); + input.set_data_type(ONNXTensorElementDataTypeToTensorProto_DataType(info->tensor_type_info->type)); auto tensor = value->GetMutable(); input.set_raw_data(tensor->DataRaw(), tensor->SizeInBytes()); @@ -645,9 +645,9 @@ ORT_API_STATUS_IMPL( CreateTypeProto_Tensor( output.mutable_type()->mutable_tensor_type(), output_name, - &info->data->shape[0], - info->data->shape.NumDimensions(), - ONNXTensorElementDataTypeToTensorProto_DataType(info->data->type) + &info->tensor_type_info->shape[0], + info->tensor_type_info->shape.NumDimensions(), + ONNXTensorElementDataTypeToTensorProto_DataType(info->tensor_type_info->type) ); } return nullptr; From 1ffe793a834cf347b13845c15aa9cd008ec23b23 Mon Sep 17 00:00:00 2001 From: Seungtaek Kim Date: Sat, 1 Mar 2025 15:25:04 +0900 Subject: [PATCH 03/46] Fix typo: change `Upample` to `Upsample`. (#23838) ### Description Fixed a typo in function names related to the Upsample CUDA kernel. Changed incorrect spelling Upample to Upsample across relevant functions. ### Motivation and Context This change is necessary to maintain consistency and prevent potential confusion caused by incorrect function names. --- .../core/providers/cuda/tensor/upsample.cc | 20 ++-- .../providers/cuda/tensor/upsample_impl.cu | 94 +++++++++---------- .../providers/cuda/tensor/upsample_impl.h | 20 ++-- 3 files changed, 67 insertions(+), 67 deletions(-) diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc index cbf745d3c7b4f..a38fe1efad540 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.cc +++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc @@ -290,16 +290,16 @@ Status Upsample::BaseCompute(OpKernelContext* context, scales_div[i] = fast_divmod(gsl::narrow_cast(ceil(scales[i]))); } - UpampleImpl(Stream(context), - mode_, - rank, - (UpsampleMode::LINEAR == mode_) ? (rank == 2 ? X_dims[0] : X_dims[2]) : 0, - input_strides, - output_div_pitches, - scales_div, - reinterpret_cast(X->Data()), - reinterpret_cast(Y->MutableData()), - output_count); + UpsampleImpl(Stream(context), + mode_, + rank, + (UpsampleMode::LINEAR == mode_) ? (rank == 2 ? 
X_dims[0] : X_dims[2]) : 0, + input_strides, + output_div_pitches, + scales_div, + reinterpret_cast(X->Data()), + reinterpret_cast(Y->MutableData()), + output_count); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu b/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu index d1c2ae6332994..24aeada559979 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu @@ -8,12 +8,12 @@ namespace onnxruntime { namespace cuda { template -__global__ void _UpampleNearestKernel(const TArray input_pitches, - const TArray output_div_pitches, - const TArray scales_div, - const T* __restrict__ input_data, - T* __restrict__ output_data, - const size_t N) { +__global__ void _UpsampleNearestKernel(const TArray input_pitches, + const TArray output_div_pitches, + const TArray scales_div, + const T* __restrict__ input_data, + T* __restrict__ output_data, + const size_t N) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); CUDA_LONG input_index = 0; CUDA_LONG output_index = id; @@ -36,13 +36,13 @@ __global__ void _UpampleNearestKernel(const TArray input_pitches, // This is the common use-case where the 4-D input (batched multi-channel images) // is usually of shape [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale] template -__global__ void _UpampleBilinear4DInputKernel(const int64_t input_dim2, - const TArray input_pitches, - const TArray output_div_pitches, - const TArray scales_div, - const T* __restrict__ input_data, - T* __restrict__ output_data, - const size_t N) { +__global__ void _UpsampleBilinear4DInputKernel(const int64_t input_dim2, + const TArray input_pitches, + const TArray output_div_pitches, + const TArray scales_div, + const T* __restrict__ input_data, + T* __restrict__ output_data, + const size_t N) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); CUDA_LONG input_index = 0; @@ -95,13 +95,13 @@ __global__ void _UpampleBilinear4DInputKernel(const int64_t input_dim2, // The following method supports a 2-D input in 'Linear mode' template -__global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0, - const TArray input_pitches, - const TArray output_div_pitches, - const TArray scales_div, - const T* __restrict__ input_data, - T* __restrict__ output_data, - const size_t N) { +__global__ void _UpsampleBilinear2DInputKernel(const int64_t input_dim0, + const TArray input_pitches, + const TArray output_div_pitches, + const TArray scales_div, + const T* __restrict__ input_data, + T* __restrict__ output_data, + const size_t N) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); CUDA_LONG input_index = 0; @@ -147,32 +147,32 @@ __global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0, } template -void UpampleImpl(cudaStream_t stream, - const onnxruntime::UpsampleMode upsample_mode, - const size_t rank, - const int64_t input_dim2, - const TArray& input_pitches, - const TArray& output_div_pitches, - const TArray& scales_div, - const T* input_data, - T* output_data, - const size_t N) { +void UpsampleImpl(cudaStream_t stream, + const onnxruntime::UpsampleMode upsample_mode, + const size_t rank, + const int64_t input_dim2, + const TArray& input_pitches, + const TArray& output_div_pitches, + const TArray& scales_div, + const T* input_data, + T* output_data, + const size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); if (onnxruntime::UpsampleMode::NN == upsample_mode) { if (rank == 4) { - _UpampleNearestKernel<<>>( + 
_UpsampleNearestKernel<<>>( input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else if (rank == 3) { - _UpampleNearestKernel<<>>( + _UpsampleNearestKernel<<>>( input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else if (rank == 2) { - _UpampleNearestKernel<<>>( + _UpsampleNearestKernel<<>>( input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else if (rank == 1) { - _UpampleNearestKernel<<>>( + _UpsampleNearestKernel<<>>( input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else { @@ -180,11 +180,11 @@ void UpampleImpl(cudaStream_t stream, } } else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode) { if (rank == 4) { - _UpampleBilinear4DInputKernel<<>>( + _UpsampleBilinear4DInputKernel<<>>( input_dim2, input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else if (rank == 2) { - _UpampleBilinear2DInputKernel<<>>( + _UpsampleBilinear2DInputKernel<<>>( input_dim2, input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else { @@ -197,17 +197,17 @@ void UpampleImpl(cudaStream_t stream, } } -#define SPECIALIZED_IMPL(T) \ - template void UpampleImpl(cudaStream_t stream, \ - const onnxruntime::UpsampleMode upsample_mode, \ - const size_t rank, \ - const int64_t input_dim2, \ - const TArray& input_pitches, \ - const TArray& output_div_pitches, \ - const TArray& scales_div, \ - const T* input_data, \ - T* output_data, \ - const size_t N); +#define SPECIALIZED_IMPL(T) \ + template void UpsampleImpl(cudaStream_t stream, \ + const onnxruntime::UpsampleMode upsample_mode, \ + const size_t rank, \ + const int64_t input_dim2, \ + const TArray& input_pitches, \ + const TArray& output_div_pitches, \ + const TArray& scales_div, \ + const T* input_data, \ + T* output_data, \ + const size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/tensor/upsample_impl.h b/onnxruntime/core/providers/cuda/tensor/upsample_impl.h index 250ec6b272e34..fb47ad8301615 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/upsample_impl.h @@ -11,16 +11,16 @@ namespace onnxruntime { namespace cuda { template -void UpampleImpl(cudaStream_t stream, - const onnxruntime::UpsampleMode upsample_mode, - const size_t rank, - const int64_t input_dim2, - const TArray& input_pitches, - const TArray& output_div_pitches, - const TArray& scales_div, - const T* input_data, - T* output_data, - const size_t N); +void UpsampleImpl(cudaStream_t stream, + const onnxruntime::UpsampleMode upsample_mode, + const size_t rank, + const int64_t input_dim2, + const TArray& input_pitches, + const TArray& output_div_pitches, + const TArray& scales_div, + const T* input_data, + T* output_data, + const size_t N); } // namespace cuda } // namespace onnxruntime From 0a6b05fb2dda82e2e18a1755f83bfaa3a0a7f5eb Mon Sep 17 00:00:00 2001 From: co63oc Date: Sat, 1 Mar 2025 16:54:58 +0800 Subject: [PATCH 04/46] [doc] Fix typos in csharp/src/Microsoft.ML.OnnxRuntime/ (#23848) ### Description Fix typos in csharp/src/Microsoft.ML.OnnxRuntime/ ### Motivation and Context --- .../src/Microsoft.ML.OnnxRuntime/ManagedProjections.shared.cs | 3 +-- csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/ManagedProjections.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/ManagedProjections.shared.cs index 
13117f23e8ef9..8916f11919cfe 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/ManagedProjections.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/ManagedProjections.shared.cs @@ -25,7 +25,7 @@ internal class ManagedTypeProjection /// /// /// - /// OrtValye created accoding to the metadata + /// OrtValue created according to the metadata internal static OrtValue CreateProjection(NamedOnnxValue namedOnnxValue, NodeMetadata metadata) { OrtValue result; @@ -191,4 +191,3 @@ private static OrtValue CreateTensorProjection(NamedOnnxValue node, NodeMetadata } } } - diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index d628b065ceaa7..b64a5c3e5a4a2 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -847,7 +847,7 @@ internal class NativeLib /// Creates an instance of OrtSession with provided parameters /// /// Native OrtEnv instance - /// Byte array correspoonding to the model + /// Byte array corresponding to the model /// Size of the model in bytes /// Native SessionOptions instance /// Native OrtPrepackedWeightsContainer instance @@ -1258,7 +1258,7 @@ IntPtr[] outputValues /* An array of output value pointers. Array must be alloca /// /// Native SessionOptions instance /// Name of the initializer - /// Native OrtValue instnce + /// Native OrtValue instance [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /*(OrtStatus*)*/ DOrtAddInitializer(IntPtr /*(OrtSessionOptions*)*/ options, byte[] /*(const char*)*/ name, From daf9565d1b550fd0d8149e8314bfce61d8df1d54 Mon Sep 17 00:00:00 2001 From: Jambay Kinley Date: Sat, 1 Mar 2025 20:16:10 -0800 Subject: [PATCH 05/46] Quant tool: Consistent `get_qdq_config` and `get_qnn_qdq_config` behavior (#23856) --- .../execution_providers/qnn/quant_config.py | 6 +- .../python/tools/quantization/quantize.py | 32 +++++++---- .../quantization/test_get_qdq_config.py | 56 +++++++++++++++++++ 3 files changed, 81 insertions(+), 13 deletions(-) diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py index ea995d4707ba3..50da0025752aa 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py @@ -204,9 +204,9 @@ def get_qnn_qdq_config( calibrate_method=calibrate_method, activation_type=activation_type, weight_type=weight_type, - op_types_to_quantize=op_types_to_quantize - if op_types_to_quantize - else list(op_types.difference(OP_TYPES_TO_EXCLUDE)), + op_types_to_quantize=( + op_types_to_quantize if op_types_to_quantize else list(op_types.difference(OP_TYPES_TO_EXCLUDE)) + ), nodes_to_exclude=nodes_to_exclude, per_channel=per_channel, use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD), diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index fa468a9676a65..d19bebad8a12c 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -240,6 +240,8 @@ def get_qdq_config( keep_removable_activations: bool = False, min_real_range: float | None = None, tensor_quant_overrides: dict[str, list[dict[str, Any]]] | None = None, + calibration_providers: list[str] | None = None, + 
op_types_to_quantize: list[str] | None = None, nodes_to_exclude: list[str] | Callable[[onnx.ModelProto, onnx.NodeProto], bool] | None = None, extra_options: dict | None = None, ) -> StaticQuantConfig: @@ -294,6 +296,10 @@ def get_qdq_config( 'convert["recv_nodes"] = Set : Set of node names that consume the converted activation, other nodes get the original type. If not specified, assume all consumer nodes get the converted type. + calibration_providers: Execution providers to run the session during calibration. Default is None which uses + [ "CPUExecutionProvider" ]. + op_types_to_quantize: List of operator types to quantize. If None, all operators other than Cast, DequantizeLinear, + and QuantizeLinear are quantized. nodes_to_exclude: List of nodes names to exclude from quantization. Alternatively, can provide a function that accepts an onnx.ModelProto and onnx.NodeProto as arguments and returns true if the give onnx.NodeProto should be excluded from quantization. @@ -324,17 +330,20 @@ def get_qdq_config( if onnx.external_data_helper.uses_external_data(initializer): model_has_external_data = True - final_nodes_to_exclude = [] - if nodes_to_exclude is not None and isinstance(nodes_to_exclude, list): - final_nodes_to_exclude.extend(nodes_to_exclude) + op_types_to_quantize_set = set(op_types_to_quantize) if op_types_to_quantize else None + nodes_to_exclude_set = set(nodes_to_exclude) if isinstance(nodes_to_exclude, list) else set() # Iterate through nodes to get all operator types in the model and # call user's function to filter out nodes from quantization. for node in model.graph.node: - op_types.add(node.op_type) - if nodes_to_exclude is not None and callable(nodes_to_exclude): - if nodes_to_exclude(model, node): - final_nodes_to_exclude.append(node.name) + if op_types_to_quantize_set and node.op_type not in op_types_to_quantize_set: + continue + if node.name in nodes_to_exclude_set: + continue + if callable(nodes_to_exclude) and nodes_to_exclude(model, node): + nodes_to_exclude_set.add(node.name) + else: + op_types.add(node.op_type) final_extra_options = { "MinimumRealRange": min_real_range, @@ -378,11 +387,14 @@ def get_qdq_config( quant_format=QuantFormat.QDQ, activation_type=activation_type, weight_type=weight_type, - op_types_to_quantize=list(op_types.difference(op_types_to_exclude)), - nodes_to_exclude=final_nodes_to_exclude, + op_types_to_quantize=( + op_types_to_quantize if op_types_to_quantize else list(op_types.difference(op_types_to_exclude)) + ), + nodes_to_exclude=list(nodes_to_exclude_set), per_channel=per_channel, reduce_range=reduce_range, use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD), + calibration_providers=calibration_providers, extra_options=final_extra_options, ) @@ -442,7 +454,7 @@ def check_static_quant_arguments(quant_format: QuantFormat, activation_type: Qua if activation_type != QuantType.QFLOAT8E4M3FN and weight_type == QuantType.QFLOAT8E4M3FN: raise ValueError( f"ONNXRuntime quantization doesn't support data format: activation_type={activation_type} " - f"!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN." + "!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN." 
) if activation_type == QuantType.QFLOAT8E4M3FN and weight_type != QuantType.QFLOAT8E4M3FN: diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py index 25f058d8f6eac..4a71b3694822c 100644 --- a/onnxruntime/test/python/quantization/test_get_qdq_config.py +++ b/onnxruntime/test/python/quantization/test_get_qdq_config.py @@ -156,6 +156,62 @@ def should_exclude_node_(model: onnx.ModelProto, node: onnx.NodeProto) -> bool: self.assertTrue(bool(expected_excluded_nodes)) self.assertEqual(set(qdq_config.nodes_to_exclude), expected_excluded_nodes) + def test_op_types_to_quantize(self): + """ + Test that get_qdq_config() returns a config that sets the op_types_to_quantize arg. + """ + shape = [1, 8, 8] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + float_model = self.build_add_model(shape, tensor_type, weight) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # No op_types_to_quantize arg means all ops are quantized. + qdq_config = get_qdq_config(float_model, data_reader, op_types_to_quantize=None) + self.assertEqual(set(qdq_config.op_types_to_quantize), {"Add"}) + + # specify custom op_types_to_quantize arg. + qdq_config = get_qdq_config(float_model, data_reader, op_types_to_quantize=["Mul"]) + self.assertEqual(set(qdq_config.op_types_to_quantize), {"Mul"}) + + # exclude op_type indirectly by specifying nodes_to_exclude arg. + qdq_config = get_qdq_config( + float_model, + data_reader, + nodes_to_exclude=[node.name for node in float_model.graph.node if node.op_type == "Add"], + ) + self.assertEqual(set(qdq_config.op_types_to_quantize), set()) + + def test_calibration_providers(self): + """ + Test that get_qdq_config() returns a config that sets the calibration providers arg. + """ + + shape = [1, 8, 8] + tensor_type = onnx.TensorProto.FLOAT + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(tensor_type) + weight = onnx.numpy_helper.from_array(np.ones(shape, dtype=np_dtype), "weight") + float_model = self.build_add_model(shape, tensor_type, weight) + + input_data_list = [ + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(-2, dtype=np_dtype)}, + {"input_0": np.ones(shape, dtype=np_dtype) * np.array(2, dtype=np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + qdq_config = get_qdq_config( + float_model, + data_reader, + calibration_providers=["CPUExecutionProvider"], + ) + self.assertEqual(qdq_config.calibration_providers, ["CPUExecutionProvider"]) + def test_external_data(self): """ Test that get_qdq_config() returns a config that enables external data From 99c51a326e0ff54a56e7b194204d459932084408 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Sun, 2 Mar 2025 20:57:01 -0800 Subject: [PATCH 06/46] Change the logic to generate the default ep context file name (#23788) Change the logic to generate the default ep context file name ### Description Applies to all EPs: replace the .onnx to _ctx.onnx, instead of directly append extra string _ctx.onnx to existing model path. In QNN EP, also make the context binary .bin file shorter by removing QNNExecutionProvider_ from the file name. 
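
For illustration, the new default naming behaves as in the sketch below (a hypothetical standalone helper using plain `std::string` for brevity; the actual logic lives in `GetValidatedEpContextPath` in the diff that follows and uses `PathString`/`ORT_TSTR`):

```cpp
#include <string>

// Sketch: derive the default EP context model path from the source model path.
// "model.onnx" -> "model_ctx.onnx". A path without an extension still gets
// "_ctx.onnx" appended, matching the previous behavior.
std::string DeriveEpContextPath(const std::string& model_path) {
  auto pos = model_path.find_last_of('.');
  if (pos != std::string::npos) {
    return model_path.substr(0, pos) + "_ctx.onnx";  // replace the extension
  }
  return model_path + "_ctx.onnx";  // no extension: append as before
}
```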
--- .../core/framework/graph_partitioner.cc | 28 +++++----- .../qnn/builder/onnx_ctx_model_helper.cc | 38 +++++-------- .../qnn/builder/onnx_ctx_model_helper.h | 7 +-- .../providers/qnn/qnn_execution_provider.cc | 30 +++++++---- .../test/providers/qnn/qnn_ep_context_test.cc | 53 +++++++++++++------ 5 files changed, 84 insertions(+), 72 deletions(-) diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 111f8e0a5fc34..b79d0327c3ef5 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -667,23 +667,28 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide } // Validate the ep_context_path to make sure it is file path and check whether the file exist already -static Status EpContextFilePathCheck(const std::string& ep_context_path, - const std::filesystem::path& model_path) { - std::filesystem::path context_cache_path; +static Status GetValidatedEpContextPath(const std::filesystem::path& ep_context_path, + const std::filesystem::path& model_path, + std::filesystem::path& context_cache_path) { if (!ep_context_path.empty()) { context_cache_path = ep_context_path; if (!context_cache_path.has_filename()) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "context_file_path should not point to a folder."); } } else if (!model_path.empty()) { - context_cache_path = model_path.native() + ORT_TSTR("_ctx.onnx"); + auto pos = model_path.native().find_last_of(ORT_TSTR(".")); + if (pos != std::string::npos) { + context_cache_path = model_path.native().substr(0, pos) + ORT_TSTR("_ctx.onnx"); + } else { + context_cache_path = model_path.native() + ORT_TSTR("_ctx.onnx"); + } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Both ep_context_path and model_path are empty."); } if (std::filesystem::exists(context_cache_path)) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to generate EP context model since the file '", - context_cache_path, "' exist already."); + context_cache_path, "' exist already. 
Please remove the EP context model if you want to re-generate it."); } return Status::OK(); @@ -714,15 +719,7 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers }; std::filesystem::path context_cache_path; - const std::filesystem::path& model_path = graph.ModelPath(); - - if (!ep_context_path.empty()) { - context_cache_path = ep_context_path; - } else if (!model_path.empty()) { - context_cache_path = model_path.native() + ORT_TSTR("_ctx.onnx"); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Both ep_context_path and model_path are empty"); - } + ORT_RETURN_IF_ERROR(GetValidatedEpContextPath(ep_context_path, graph.ModelPath(), context_cache_path)); Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(), graph.GetModel().ModelPath(), // use source model path so that external initializers can find the data file path @@ -1068,7 +1065,8 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, if (ep_context_enabled) { std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); // Check before EP compile graphs - ORT_RETURN_IF_ERROR(EpContextFilePathCheck(ep_context_path, graph.ModelPath())); + std::filesystem::path context_cache_path; + ORT_RETURN_IF_ERROR(GetValidatedEpContextPath(ep_context_path, graph.ModelPath(), context_cache_path)); } // We use this only if Resource Aware Partitioning is enabled for any of the EPs diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index 3df231e53e7c0..d85277627a3de 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -198,35 +198,13 @@ Status LoadQnnCtxFromOnnxGraph(const onnxruntime::GraphViewer& graph_viewer, return Status::OK(); } -// Figure out the real context cache file path -// return true if context cache file exists -bool ValidateContextCacheFilePath(bool is_qnn_ctx_model, - const std::string& customer_context_cache_path, - const onnxruntime::PathString& model_pathstring, - onnxruntime::PathString& context_cache_path) { - // always try the path set by user first, it's the only way to set it if load model from memory - if (!customer_context_cache_path.empty()) { - context_cache_path = ToPathString(customer_context_cache_path); - } else if (!model_pathstring.empty()) { // model loaded from file - if (is_qnn_ctx_model) { - // it's a context cache model, just use the model path - context_cache_path = model_pathstring; - } else if (!model_pathstring.empty()) { - // this is not a normal Onnx model, no customer path, create a default path for generation: model_path + _ctx.onnx - context_cache_path = model_pathstring + ToPathString("_ctx.onnx"); - } - } - - return std::filesystem::is_regular_file(context_cache_path) && std::filesystem::exists(context_cache_path); -} - Status CreateEPContextNodes(Model* model, unsigned char* buffer, uint64_t buffer_size, const std::string& sdk_build_version, const std::vector& fused_nodes_and_graphs, const QnnModelLookupTable& qnn_models, - const onnxruntime::PathString& context_cache_path, + const onnxruntime::PathString& context_model_path, bool qnn_context_embed_mode, uint64_t max_spill_fill_buffer_size, const logging::Logger& logger) { @@ -262,7 +240,19 @@ Status CreateEPContextNodes(Model* model, std::string cache_payload(buffer, buffer + buffer_size); ep_node.AddAttribute(EP_CACHE_CONTEXT, 
cache_payload); } else { - onnxruntime::PathString context_bin_path = context_cache_path + ToPathString("_" + graph_name + ".bin"); + onnxruntime::PathString context_bin_path; + auto pos = context_model_path.find_last_of(ORT_TSTR(".")); + if (pos != std::string::npos) { + context_bin_path = context_model_path.substr(0, pos); + } else { + context_bin_path = context_model_path; + } + std::string graph_name_in_file(graph_name); + auto name_pos = graph_name_in_file.find_first_of(kQnnExecutionProvider); + if (name_pos != std::string::npos) { + graph_name_in_file.replace(name_pos, strlen(kQnnExecutionProvider), ""); + } + context_bin_path = context_bin_path + ToPathString(graph_name_in_file + ".bin"); std::string context_cache_name(std::filesystem::path(context_bin_path).filename().string()); std::ofstream of_stream(context_bin_path.c_str(), std::ofstream::binary); if (!of_stream) { diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h index 3dfa0ae21001b..c54cd3ca6e90c 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h @@ -38,11 +38,6 @@ Status CreateNodeArgs(const std::vector& names, std::vector& node_args, onnxruntime::Graph& graph); -bool ValidateContextCacheFilePath(bool is_qnn_ctx_model, - const std::string& customer_context_cache_path, - const onnxruntime::PathString& model_pathstring, - onnxruntime::PathString& context_cache_path); - Status GetEpContextFromMainNode(const onnxruntime::Node& main_context_node, const onnxruntime::PathString& ctx_onnx_model_path, QnnBackendManager* qnn_backend_manager, @@ -67,7 +62,7 @@ Status CreateEPContextNodes(Model* model, const std::string& sdk_build_version, const std::vector& fused_nodes_and_graphs, const std::unordered_map>& qnn_models, - const onnxruntime::PathString& context_cache_path, + const onnxruntime::PathString& context_model_path, bool qnn_context_embed_mode, uint64_t max_spill_fill_buffer_size, const logging::Logger& logger); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 3fc537066ae0b..99a6f51f6f712 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -904,25 +904,33 @@ Status QNNExecutionProvider::CompileFromOrtGraph(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) { const auto& logger = *GetLogger(); bool is_qnn_ctx_model = qnn::IsFusedGraphHasCtxNode(fused_nodes_and_graphs); - onnxruntime::PathString context_cache_path; + onnxruntime::PathString context_model_path; bool is_ctx_file_exist = false; if (is_qnn_ctx_model || context_cache_enabled_) { const onnxruntime::GraphViewer& graph_viewer_0(fused_nodes_and_graphs[0].filtered_graph); - is_ctx_file_exist = qnn::ValidateContextCacheFilePath(is_qnn_ctx_model, - context_cache_path_cfg_, - graph_viewer_0.ModelPath().native(), - context_cache_path); + // Figure out the EP context model path from model path or session option + GetContextOnnxModelFilePath(context_cache_path_cfg_, + graph_viewer_0.ModelPath().native(), + context_model_path); } - ORT_RETURN_IF(is_ctx_file_exist && !is_qnn_ctx_model && context_cache_enabled_, - "The inference session is created from normal ONNX model. And an EP context model file is provided and existed. 
", - "Please remove the EP context model manually if you want to re-generate it."); - if (is_qnn_ctx_model) { // Get QnnModel from EP shared contexts if (share_ep_contexts_ && SharedContext::GetInstance().HasSharedQnnModels()) { @@ -965,7 +973,7 @@ Status QNNExecutionProvider::Compile(const std::vector& fused const onnxruntime::GraphViewer& main_ctx_graph_viewer(fused_nodes_and_graphs[main_context_pos].filtered_graph); // Create QNN context from the cached binary, deserialize the QNN graph from the binary ORT_RETURN_IF_ERROR(qnn::LoadQnnCtxFromOnnxGraph(main_ctx_graph_viewer, - context_cache_path, + context_model_path, qnn_backend_manager_.get(), qnn_models, logger, @@ -1025,7 +1033,7 @@ Status QNNExecutionProvider::Compile(const std::vector& fused qnn_backend_manager_->GetSdkVersion(), fused_nodes_and_graphs, qnn_models_, - context_cache_path, + context_model_path, qnn_context_embed_mode_, max_spill_fill_buffer_size, logger)); diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index 07843c30a61df..e50dd7c214240 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -333,7 +333,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryGenerationNoOverWrite) { const auto model_data_span = AsByteSpan(model_data.data(), model_data.size()); const std::string ep_context_onnx_file = "./ep_context_no_over_write.onnx"; - const std::string ep_context_binary_file = "./ep_context_no_over_write.onnx_QNNExecutionProvider_QNN_10880527342279992768_1_0.bin"; + const std::string ep_context_binary_file = "./ep_context_no_over_write_QNN_10880527342279992768_1_0.bin"; std::remove(ep_context_onnx_file.c_str()); Ort::SessionOptions so; @@ -580,6 +580,8 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheEmbedModeTest) { EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); // 2nd run directly loads and run from Qnn context cache model + std::unordered_map session_option_pairs2; + session_option_pairs2.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file); TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def}, {}, {}), BuildQDQOpTestCase(op_type, {input_def}, {}, {}), provider_options, @@ -587,7 +589,8 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheEmbedModeTest) { ExpectedEPNodeAssignment::All, QDQTolerance(), logging::Severity::kERROR, - context_binary_file); + context_binary_file, + session_option_pairs2); // Clean up ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); } @@ -604,7 +607,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheNonEmbedModeTest) { #endif provider_options["offload_graph_io_quantization"] = "0"; const std::string context_binary_file = "./testdata/qnn_context_cache_non_embed.onnx"; - std::string qnn_ctx_bin = "./testdata/qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"; + std::string qnn_ctx_bin = "./testdata/qnn_context_cache_non_embed_QNN_8283143575221199085_1_0.bin"; std::unordered_map session_option_pairs; session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1"); @@ -686,7 +689,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_InvalidGraph) { #endif provider_options["offload_graph_io_quantization"] = "0"; const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx"; - std::filesystem::path context_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"; + std::filesystem::path context_bin = 
"qnn_context_cache_non_embed_QNN_8283143575221199085_1_0.bin"; std::remove(context_binary_file.c_str()); std::remove(context_bin.string().c_str()); @@ -828,6 +831,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryFileNotExistTest) { SessionOptions so; so.session_logid = "qnn_ctx_model_logger"; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "./qnn_context_not_exist.onnx")); RunOptions run_options; run_options.run_tag = so.session_logid; @@ -841,7 +845,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryFileNotExistTest) { #endif provider_options["offload_graph_io_quantization"] = "0"; - ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options, &so))); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); // Verify the return status with code INVALID_GRAPH ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH); @@ -854,6 +858,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryFileEmptyStringTest) { SessionOptions so; so.session_logid = "qnn_ctx_model_logger"; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "./test_ctx.onnx")); RunOptions run_options; run_options.run_tag = so.session_logid; @@ -867,7 +872,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryFileEmptyStringTest) { #endif provider_options["offload_graph_io_quantization"] = "0"; - ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options, &so))); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); // Verify the return status with code INVALID_GRAPH ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH); @@ -911,6 +916,8 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); // 2nd run directly loads and run from Qnn context cache model + std::unordered_map session_option_pairs2; + session_option_pairs2.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file); TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def1, input_def2}, {}, {}), BuildQDQOpTestCase(op_type, {input_def1, input_def2}, {}, {}), provider_options, @@ -918,7 +925,8 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { ExpectedEPNodeAssignment::All, QDQTolerance(), logging::Severity::kERROR, - context_binary_file); + context_binary_file, + session_option_pairs2); // Clean up ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); } @@ -936,14 +944,14 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphName provider_options["backend_path"] = "libQnnHtp.so"; #endif provider_options["offload_graph_io_quantization"] = "0"; - const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx"; - std::filesystem::path context_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"; - std::remove(context_binary_file.c_str()); + const std::string context_model_file = "./qnn_context_cache_non_embed.onnx"; + std::filesystem::path context_bin = "qnn_context_cache_non_embed_QNN_8283143575221199085_1_0.bin"; + std::remove(context_model_file.c_str()); std::remove(context_bin.string().c_str()); 
std::unordered_map session_option_pairs; session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1"); - session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file); + session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_model_file); session_option_pairs.emplace(kOrtSessionOptionEpContextEmbedMode, "0"); const TestInputDef input_def({1, 2, 3}, false, -10.0f, 10.0f); @@ -962,7 +970,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphName session_option_pairs); // Check the Onnx skeleton file is generated - EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + EXPECT_TRUE(std::filesystem::exists(context_model_file.c_str())); // Check the Qnn context cache binary file is generated EXPECT_TRUE(std::filesystem::exists(context_bin)); @@ -990,18 +998,19 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphName SessionOptions so; so.session_logid = "qnn_ctx_model_logger"; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_model_file.c_str())); RunOptions run_options; run_options.run_tag = so.session_logid; InferenceSessionWrapper session_object{so, GetEnvironment()}; - ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options, &so))); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); // Verify the return status with code INVALID_GRAPH ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::OK); // Clean up - ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + ASSERT_EQ(std::remove(context_model_file.c_str()), 0); ASSERT_EQ(std::remove(context_bin.string().c_str()), 0); } @@ -1167,7 +1176,13 @@ TEST_F(QnnHTPBackendTests, QnnContextShareAcrossSessions1) { for (auto model_path : onnx_model_paths) { CreateQdqModel(model_path, DefaultLoggingManager().DefaultLogger()); EXPECT_TRUE(std::filesystem::exists(model_path.c_str())); - ctx_model_paths.push_back(model_path + "_ctx.onnx"); + auto pos = model_path.find_last_of("."); + if (pos != std::string::npos) { + model_path = model_path.substr(0, pos) + "_ctx.onnx"; + } else { + model_path = model_path + "_ctx.onnx"; + } + ctx_model_paths.push_back(model_path); } DumpModelWithSharedCtx(provider_options, onnx_model_paths[0], onnx_model_paths[1]); @@ -1265,7 +1280,13 @@ TEST_F(QnnHTPBackendTests, QnnContextShareAcrossSessions2) { for (auto model_path : onnx_model_paths) { CreateQdqModel(model_path, DefaultLoggingManager().DefaultLogger()); EXPECT_TRUE(std::filesystem::exists(model_path.c_str())); - ctx_model_paths.push_back(model_path + "_ctx.onnx"); + auto pos = model_path.find_last_of("."); + if (pos != std::string::npos) { + model_path = model_path.substr(0, pos) + "_ctx.onnx"; + } else { + model_path = model_path + "_ctx.onnx"; + } + ctx_model_paths.push_back(model_path); } DumpModelWithSharedCtx(provider_options, onnx_model_paths[0], onnx_model_paths[1]); From 7f0c2c644c83a5175d92c9b2fdf20399a6faad1d Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 3 Mar 2025 13:14:04 -0500 Subject: [PATCH 07/46] Make Nuget QNN package pipeline 1ES compliant (#23805) ### Description Make [QNN_Nuget_Windows](https://aiinfra.visualstudio.com/Lotus/_build?definitionId=1234)1ES compliant ### Motivation and Context --- .../qnn-ep-nuget-packaging-pipeline.yml | 148 ++++------ 
.../stages/nuget-qnn-packaging-stage.yml | 76 +++++ .../azure-pipelines/templates/qnn-ep-win.yml | 259 +++++++++--------- 3 files changed, 255 insertions(+), 228 deletions(-) create mode 100644 tools/ci_build/github/azure-pipelines/stages/nuget-qnn-packaging-stage.yml diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index 055ef58e4524a..cfca998e0f06c 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -29,108 +29,58 @@ parameters: displayName: Pipeline BuildId, you could find it in the URL type: string default: '0' - -stages: - -- template: templates/qnn-ep-win.yml - parameters: - qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' - QnnSdk: ${{ parameters.QnnSdk }} - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - DoEsrp: ${{ parameters.DoEsrp }} - ArtifactName: 'drop-nuget-qnn-x64' - StageName: 'OnnxRuntime_QNN_Nuget_Win_x64' - build_config: ${{ parameters.build_config }} - -- template: templates/qnn-ep-win.yml +resources: + repositories: + - repository: 1esPipelines + type: git + name: 1ESPipelineTemplates/1ESPipelineTemplates + ref: refs/tags/release +extends: + # The pipeline extends the 1ES PT which will inject different SDL and compliance tasks. + # For non-production pipelines, use "Unofficial" as defined below. + # For productions pipelines, use "Official". + template: v1/1ES.Official.PipelineTemplate.yml@1esPipelines parameters: - qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' - QnnSdk: ${{ parameters.QnnSdk }} - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - DoEsrp: ${{ parameters.DoEsrp }} - ArtifactName: 'drop-nuget-qnn-arm64' - buildParameter: '--arm64' - buildPlatform: 'ARM64' - buildArch: 'ARM64' - StageName: 'OnnxRuntime_QNN_Nuget_Win_Arm64' - build_config: ${{ parameters.build_config }} - -- stage: NuGet_Packaging_QNN - pool: 'Onnxruntime-QNNEP-Windows-2022-CPU' - dependsOn: - - OnnxRuntime_QNN_Nuget_Win_x64 - - OnnxRuntime_QNN_Nuget_Win_Arm64 - condition: succeeded() - jobs: - - job: NuGet_Packaging_QNN - workspace: - clean: all - steps: - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - QNN NuGet x64' - inputs: - artifactName: 'drop-nuget-qnn-x64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-x64' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - QNN NuGet arm64' - inputs: - artifactName: 'drop-nuget-qnn-arm64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64' - - - task: PowerShell@2 - displayName: 'Bundle NuGet' - inputs: - targetType: 'inline' - script: | - - $x64_nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/nuget-artifact-x64 -Filter Microsoft.ML.OnnxRuntime.QNN*.nupkg -Recurse) - $nuget_package_name = $x64_nupkgs[0].Name - $x64_nuget_package = $x64_nupkgs[0].FullName - - $nupkg_unzipped_directory = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'nuget_unzip_merged', [System.IO.Path]::GetFileNameWithoutExtension($nuget_package_name)) - - $x64_unzip_cmd = "7z.exe x $x64_nuget_package -y -o$nupkg_unzipped_directory" - Invoke-Expression -Command $x64_unzip_cmd - - $arm64_nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/nuget-artifact-arm64 -Filter Microsoft.ML.OnnxRuntime.QNN*.nupkg -Recurse) - $arm64_nuget_package = $arm64_nupkgs[0].FullName + sdl: + sourceAnalysisPool: + name: 
onnxruntime-Win-CPU-2022 + os: windows + stages: - $arm64_unzip_cmd = "7z.exe x $arm64_nuget_package -y -o$nupkg_unzipped_directory" - Invoke-Expression -Command $arm64_unzip_cmd - - $merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'nuget-artifact-merged') - if (!(Test-Path $merged_nuget_path)) { - New-Item -Path $merged_nuget_path -ItemType Directory - } - - $merged_zip = [System.IO.Path]::Combine($merged_nuget_path, 'qnn_nuget.zip') - $zip_cmd = "7z.exe a -r $merged_zip $nupkg_unzipped_directory/*" - Invoke-Expression -Command $zip_cmd - - $merged_nuget = [System.IO.Path]::Combine($merged_nuget_path, $nuget_package_name) - move $merged_zip $merged_nuget - workingDirectory: $(Build.BinariesDirectory) - - - template: templates/esrp_nuget.yml + - template: templates/qnn-ep-win.yml parameters: - DisplayName: 'ESRP - sign NuGet package' - FolderPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' + qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' + QnnSdk: ${{ parameters.QnnSdk }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} DoEsrp: ${{ parameters.DoEsrp }} + ArtifactName: 'drop-nuget-qnn-x64' + StageName: 'OnnxRuntime_QNN_Nuget_Win_x64' + build_config: ${{ parameters.build_config }} - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline NuGet Artifact' - inputs: - artifactName: 'drop-signed-nuget-qnn' - targetPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' + - template: templates/qnn-ep-win.yml + parameters: + qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' + QnnSdk: ${{ parameters.QnnSdk }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + DoEsrp: ${{ parameters.DoEsrp }} + ArtifactName: 'drop-nuget-qnn-arm64' + buildParameter: '--arm64' + buildPlatform: 'ARM64' + buildArch: 'ARM64' + StageName: 'OnnxRuntime_QNN_Nuget_Win_Arm64' + build_config: ${{ parameters.build_config }} + + - template: stages/nuget-qnn-packaging-stage.yml + parameters: + DoEsrp: ${{ parameters.DoEsrp }} -- template: templates/publish-nuget-steps.yml - parameters: - download_artifacts_steps: - - template: templates/flex-downloadPipelineArtifact.yml - parameters: - StepName: 'Download Pipeline Artifact - Signed NuGet Qnn Package' - ArtifactName: 'drop-signed-nuget-qnn' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact/final-package' - SpecificArtifact: ${{ parameters.specificArtifact }} - BuildId: ${{ parameters.BuildId }} + - template: templates/publish-nuget-steps.yml + parameters: + download_artifacts_steps: + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Pipeline Artifact - Signed NuGet Qnn Package' + ArtifactName: 'drop-signed-nuget-qnn' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact/final-package' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-qnn-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-qnn-packaging-stage.yml new file mode 100644 index 0000000000000..03802746cec3d --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-qnn-packaging-stage.yml @@ -0,0 +1,76 @@ +parameters: +- name: DoEsrp + displayName: Run code sign tasks? Must be true if you are doing an Onnx Runtime release. 
+ type: boolean + default: true + +stages: +- stage: NuGet_Packaging_QNN + pool: + name: 'Onnxruntime-QNNEP-Windows-2022-CPU' + dependsOn: + - OnnxRuntime_QNN_Nuget_Win_x64 + - OnnxRuntime_QNN_Nuget_Win_Arm64 + condition: succeeded() + jobs: + - job: NuGet_Packaging_QNN + workspace: + clean: all + steps: + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - QNN NuGet x64' + inputs: + artifactName: 'drop-nuget-qnn-x64' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact-x64' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - QNN NuGet arm64' + inputs: + artifactName: 'drop-nuget-qnn-arm64' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64' + + - task: PowerShell@2 + displayName: 'Bundle NuGet' + inputs: + targetType: 'inline' + script: | + + $x64_nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/nuget-artifact-x64 -Filter Microsoft.ML.OnnxRuntime.QNN*.nupkg -Recurse) + $nuget_package_name = $x64_nupkgs[0].Name + $x64_nuget_package = $x64_nupkgs[0].FullName + + $nupkg_unzipped_directory = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'nuget_unzip_merged', [System.IO.Path]::GetFileNameWithoutExtension($nuget_package_name)) + + $x64_unzip_cmd = "7z.exe x $x64_nuget_package -y -o$nupkg_unzipped_directory" + Invoke-Expression -Command $x64_unzip_cmd + + $arm64_nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/nuget-artifact-arm64 -Filter Microsoft.ML.OnnxRuntime.QNN*.nupkg -Recurse) + $arm64_nuget_package = $arm64_nupkgs[0].FullName + + $arm64_unzip_cmd = "7z.exe x $arm64_nuget_package -y -o$nupkg_unzipped_directory" + Invoke-Expression -Command $arm64_unzip_cmd + + $merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'nuget-artifact-merged') + if (!(Test-Path $merged_nuget_path)) { + New-Item -Path $merged_nuget_path -ItemType Directory + } + + $merged_zip = [System.IO.Path]::Combine($merged_nuget_path, 'qnn_nuget.zip') + $zip_cmd = "7z.exe a -r $merged_zip $nupkg_unzipped_directory/*" + Invoke-Expression -Command $zip_cmd + + $merged_nuget = [System.IO.Path]::Combine($merged_nuget_path, $nuget_package_name) + move $merged_zip $merged_nuget + workingDirectory: $(Build.BinariesDirectory) + + - template: ../templates/esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' + DoEsrp: ${{ parameters.DoEsrp }} + + - task: 1ES.PublishPipelineArtifact@1 + displayName: 'Publish Pipeline NuGet Artifact' + inputs: + artifactName: 'drop-signed-nuget-qnn' + targetPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index a93d6b5ff8419..b591a3e3e121b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -18,7 +18,8 @@ stages: - job: ${{ parameters.StageName }} timeoutInMinutes: 120 - pool: ${{ parameters.qnn_ep_build_pool_name }} + pool: + name: ${{ parameters.qnn_ep_build_pool_name }} variables: ${{ if eq(parameters.buildArch, 'ARM64') }}: targetArchitecture: 'arm64' @@ -28,134 +29,134 @@ stages: commonBuildArgs: '--update --compile_no_warning_as_error --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel 
--use_binskim_compliant_compile_flags ${{ parameters.buildParameter }} ' steps: - - template: set-version-number-variables-step.yml - - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.12' - addToPath: true - - - template: jobs/download_win_qnn_sdk.yml - parameters: - QnnSDKVersion: ${{ parameters.QnnSdk }} - - - task: PythonScript@0 - displayName: 'Generate project' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)' - - - task: VSBuild@1 - displayName: 'Build onnxruntime' - inputs: - solution: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\onnxruntime.vcxproj' - platform: ${{ parameters.buildPlatform }} - configuration: ${{ parameters.build_config }} - msbuildArchitecture: ${{ parameters.buildArch }} - maximumCpuCount: true - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' - createLogFile: true - - - task: VSBuild@1 - displayName: 'Build onnx_test_runner' - inputs: - solution: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\onnx_test_runner.vcxproj' - platform: ${{ parameters.buildPlatform }} - configuration: ${{ parameters.build_config }} - msbuildArchitecture: ${{ parameters.buildArch }} - maximumCpuCount: true - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' - createLogFile: true - - - task: VSBuild@1 - displayName: 'Build onnxruntime_perf_test' - inputs: - solution: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\onnxruntime_perf_test.vcxproj' - platform: ${{ parameters.buildPlatform }} - configuration: ${{ parameters.build_config }} - msbuildArchitecture: ${{ parameters.buildArch }} - maximumCpuCount: true - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' - createLogFile: true - - - task: VSBuild@1 - displayName: 'Build onnxruntime_test_all (to copy Qnn libs)' - inputs: - solution: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\onnxruntime_test_all.vcxproj' - platform: ${{ parameters.buildPlatform }} - configuration: ${{ parameters.build_config }} - msbuildArchitecture: ${{ parameters.buildArch }} - maximumCpuCount: true - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' - createLogFile: true - - - task: CmdLine@2 - displayName: 'Print contents of binaries directory' - inputs: - script: | - dir $(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }} - + - template: set-version-number-variables-step.yml + + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.12' + addToPath: true + + - template: jobs/download_win_qnn_sdk.yml + parameters: + QnnSDKVersion: ${{ parameters.QnnSdk }} + + - task: PythonScript@0 + displayName: 'Generate project' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '--use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)' + + - task: VSBuild@1 + displayName: 'Build onnxruntime' + inputs: + solution: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\onnxruntime.vcxproj' + platform: ${{ parameters.buildPlatform }} + configuration: ${{ parameters.build_config }} + msbuildArchitecture: ${{ parameters.buildArch }} + maximumCpuCount: true + logProjectEvents: true + workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' + createLogFile: true + + - task: VSBuild@1 + displayName: 'Build onnx_test_runner' + inputs: + 
solution: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\onnx_test_runner.vcxproj' + platform: ${{ parameters.buildPlatform }} + configuration: ${{ parameters.build_config }} + msbuildArchitecture: ${{ parameters.buildArch }} + maximumCpuCount: true + logProjectEvents: true + workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' + createLogFile: true + + - task: VSBuild@1 + displayName: 'Build onnxruntime_perf_test' + inputs: + solution: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\onnxruntime_perf_test.vcxproj' + platform: ${{ parameters.buildPlatform }} + configuration: ${{ parameters.build_config }} + msbuildArchitecture: ${{ parameters.buildArch }} + maximumCpuCount: true + logProjectEvents: true + workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' + createLogFile: true + + - task: VSBuild@1 + displayName: 'Build onnxruntime_test_all (to copy Qnn libs)' + inputs: + solution: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\onnxruntime_test_all.vcxproj' + platform: ${{ parameters.buildPlatform }} + configuration: ${{ parameters.build_config }} + msbuildArchitecture: ${{ parameters.buildArch }} + maximumCpuCount: true + logProjectEvents: true + workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}' + createLogFile: true + + - task: CmdLine@2 + displayName: 'Print contents of binaries directory' + inputs: + script: | + dir $(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }} + + - template: win-esrp-dll.yml + parameters: + FolderPath: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' + DisplayName: 'ESRP - Sign dlls' + DoEsrp: ${{ parameters.DoEsrp }} + Pattern: 'onnxruntime*.dll' + + - task: MSBuild@1 + displayName: 'Restore NuGet Packages and create project.assets.json' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' + platform: 'Any CPU' + configuration: ${{ parameters.build_config }} + msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Build C# bindings' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' + platform: 'Any CPU' + configuration: ${{ parameters.build_config }} + msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - ${{ if eq(parameters.DoEsrp, true) }}: - template: win-esrp-dll.yml parameters: - FolderPath: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' - DisplayName: 'ESRP - Sign dlls' + FolderPath: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\${{ parameters.build_config }}' + DisplayName: 'ESRP - Sign C# dlls' DoEsrp: ${{ parameters.DoEsrp }} - Pattern: 'onnxruntime*.dll' - - - task: MSBuild@1 - displayName: 'Restore NuGet Packages and create project.assets.json' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' - platform: 'Any CPU' - configuration: ${{ parameters.build_config }} - msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: MSBuild@1 - displayName: 'Build C# bindings' - inputs: - solution: 
'$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' - platform: 'Any CPU' - configuration: ${{ parameters.build_config }} - msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - ${{ if eq(parameters.DoEsrp, true) }}: - - template: win-esrp-dll.yml - parameters: - FolderPath: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\${{ parameters.build_config }}' - DisplayName: 'ESRP - Sign C# dlls' - DoEsrp: ${{ parameters.DoEsrp }} - - - task: MSBuild@1 - displayName: 'Build Nuget Packages' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj' - platform: 'Any CPU' - configuration: ${{ parameters.build_config }} - msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:TargetArchitecture=$(targetArchitecture)' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: CopyFiles@2 - displayName: 'Copy native nuget package to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' - Contents: '*.nupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: CopyFiles@2 - displayName: 'Copy native nuget symbols package to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' - Contents: '*.snupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline x64 NuGet Artifact' - inputs: - artifactName: ${{ parameters.ArtifactName }} - targetPath: '$(Build.ArtifactStagingDirectory)' + + - task: MSBuild@1 + displayName: 'Build Nuget Packages' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj' + platform: 'Any CPU' + configuration: ${{ parameters.build_config }} + msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:TargetArchitecture=$(targetArchitecture)' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: CopyFiles@2 + displayName: 'Copy native nuget package to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' + Contents: '*.nupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: CopyFiles@2 + displayName: 'Copy native nuget symbols package to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}' + Contents: '*.snupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: 1ES.PublishPipelineArtifact@1 + displayName: 'Publish Pipeline x64 NuGet Artifact' + inputs: + artifactName: ${{ parameters.ArtifactName }} + targetPath: '$(Build.ArtifactStagingDirectory)' From 18725277e3a54cb23f096eba50ff3d8da3d528ab Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 3 Mar 2025 13:49:17 -0800 Subject: [PATCH 08/46] [js/common] allows using Uint16Array as data for float16 tensor (#23827) ### Description Resolve #23817 ### Motivation and Context --- 
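A minimal usage sketch of the new behavior (assuming this patch is applied to
`onnxruntime-common`; `Float16Array` is only present in runtimes that ship it,
e.g. Node.js 22+ with the V8 flag `--js-float16array`, as the CI change below
notes):

```
import { Tensor } from 'onnxruntime-common';

// Raw IEEE-754 half-precision bit patterns: 15360 -> 1.0, 16384 -> 2.0,
// 16896 -> 3.0, 17408 -> 4.0 (the same values used by the new
// constructor-f16.ts tests).
const bits = new Uint16Array([15360, 16384, 16896, 17408]);

// A Uint16Array is now accepted as data for a 'float16' tensor. When the
// runtime provides Float16Array, the buffer is reinterpreted in place as a
// Float16Array view; otherwise the tensor keeps the Uint16Array as-is.
const t = new Tensor('float16', bits, [2, 2]);
console.log(t.type);    // 'float16'
console.log(t.data[0]); // 1 if Float16Array is available, 15360 otherwise
```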
js/common/lib/tensor-impl-type-mapping.ts | 9 +-- js/common/lib/tensor-impl.ts | 7 +++ js/common/package.json | 3 +- js/common/test/unit-tests/common.ts | 5 +- .../test/unit-tests/tensor/constructor-f16.ts | 62 +++++++++++++++++++ .../unit-tests/tensor/constructor-type.ts | 8 --- .../templates/linux-web-init-and-check.yml | 8 +++ .../azure-pipelines/templates/web-ci.yml | 3 - 8 files changed, 85 insertions(+), 20 deletions(-) create mode 100644 js/common/test/unit-tests/tensor/constructor-f16.ts diff --git a/js/common/lib/tensor-impl-type-mapping.ts b/js/common/lib/tensor-impl-type-mapping.ts index 14dbdca707220..58f4cc6281b09 100644 --- a/js/common/lib/tensor-impl-type-mapping.ts +++ b/js/common/lib/tensor-impl-type-mapping.ts @@ -44,12 +44,6 @@ export const NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP = new Map { isTypedArrayChecked = true; const isBigInt64ArrayAvailable = typeof BigInt64Array !== 'undefined' && BigInt64Array.from; const isBigUint64ArrayAvailable = typeof BigUint64Array !== 'undefined' && BigUint64Array.from; + + // eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-explicit-any + const Float16Array = (globalThis as any).Float16Array; const isFloat16ArrayAvailable = typeof Float16Array !== 'undefined' && Float16Array.from; if (isBigInt64ArrayAvailable) { diff --git a/js/common/lib/tensor-impl.ts b/js/common/lib/tensor-impl.ts index 8feb8d7205fa1..2c54bdbfb6874 100644 --- a/js/common/lib/tensor-impl.ts +++ b/js/common/lib/tensor-impl.ts @@ -261,6 +261,13 @@ export class Tensor implements TensorInterface { } else { throw new TypeError(`A Uint8ClampedArray tensor's data must be type of uint8`); } + } else if (arg0 === 'float16' && arg1 instanceof Uint16Array && typedArrayConstructor !== Uint16Array) { + // when Float16Array is available and data is of type Uint16Array. + // We allow Uint16Array to be passed in as data for 'float16' tensor until Float16Array is generally + // supported in JavaScript environment. + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + data = new (globalThis as any).Float16Array(arg1.buffer, arg1.byteOffset, arg1.length); } else { throw new TypeError(`A ${type} tensor's data must be type of ${typedArrayConstructor}`); } diff --git a/js/common/package.json b/js/common/package.json index 3d8d3f6533cfe..2d331bb42e4c7 100644 --- a/js/common/package.json +++ b/js/common/package.json @@ -15,7 +15,8 @@ "build": "node ./build.js", "prepare": "npm run build", "pretest": "tsc --build ./test", - "test": "mocha ./test/**/*.js --timeout 30000" + "test": "mocha \"./test/**/*.js\" --timeout 30000", + "test:f16": "mocha -n js-float16array \"./test/**/*.js\" --timeout 30000" }, "devDependencies": { "typedoc": "^0.25.7" diff --git a/js/common/test/unit-tests/common.ts b/js/common/test/unit-tests/common.ts index 0a6e4e5dd6ebd..bbbceed605bd4 100644 --- a/js/common/test/unit-tests/common.ts +++ b/js/common/test/unit-tests/common.ts @@ -29,9 +29,10 @@ export const NUMBER_COMPATIBLE_NUMERICAL_TYPES = [ export const BIGINT_TYPES = [['int64', BigInt64Array, true] as const, ['uint64', BigUint64Array, true] as const]; /** - * float16 type, data represented by Uint16Array + * float16 type, data represented by Uint16Array/Float16Array */ -export const FLOAT16_TYPE = ['float16', Uint16Array, false] as const; +// eslint-disable-next-line @typescript-eslint/no-explicit-any +export const FLOAT16_TYPE = ['float16', (globalThis as any).Float16Array ?? Uint16Array, false] as const; /** * A list of all numerical types. 
diff --git a/js/common/test/unit-tests/tensor/constructor-f16.ts b/js/common/test/unit-tests/tensor/constructor-f16.ts new file mode 100644 index 0000000000000..38c6ac037c5f9 --- /dev/null +++ b/js/common/test/unit-tests/tensor/constructor-f16.ts @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import assert from 'assert/strict'; +import { Tensor } from 'onnxruntime-common'; + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +const globalF16 = (globalThis as any).Float16Array; + +(globalF16 ? describe : describe.skip)('Tensor Constructor Tests - check type float16 (Float16Array available)', () => { + it("[float16] new Tensor('float16', numbers, dims): allow number array when Float16Array is available", () => { + const tensor = new Tensor('float16', [1, 2, 3, 4], [2, 2]); + assert.equal(tensor.type, 'float16', "tensor.type should be 'float16'"); + assert(tensor.data instanceof globalF16, "tensor.data should be an instance of 'Float16Array'"); + assert.equal(tensor.data[0], 1, 'tensor.data[0] should be 1'); + assert.equal(tensor.data[1], 2, 'tensor.data[1] should be 2'); + assert.equal(tensor.data[2], 3, 'tensor.data[2] should be 3'); + assert.equal(tensor.data[3], 4, 'tensor.data[3] should be 4'); + assert.equal(tensor.data.length, 4, 'tensor.data.length should be 4'); + }); + + it("[float16] new Tensor('float16', float16array, dims): allow Float16Array when Float16Array is available", () => { + const tensor = new Tensor('float16', new globalF16([1, 2, 3, 4]), [2, 2]); + assert.equal(tensor.type, 'float16', "tensor.type should be 'float16'"); + assert(tensor.data instanceof globalF16, "tensor.data should be an instance of 'Float16Array'"); + assert.equal(tensor.data[0], 1, 'tensor.data[0] should be 1'); + assert.equal(tensor.data[1], 2, 'tensor.data[1] should be 2'); + assert.equal(tensor.data[2], 3, 'tensor.data[2] should be 3'); + assert.equal(tensor.data[3], 4, 'tensor.data[3] should be 4'); + assert.equal(tensor.data.length, 4, 'tensor.data.length should be 4'); + }); + + it("[float16] new Tensor('float16', uint16array, dims): allow Uint16Array when Float16Array is available", () => { + const tensor = new Tensor('float16', new Uint16Array([15360, 16384, 16896, 17408]), [2, 2]); + assert.equal(tensor.type, 'float16', "tensor.type should be 'float16'"); + assert(tensor.data instanceof globalF16, "tensor.data should be an instance of 'Float16Array'"); + assert.equal(tensor.data[0], 1, 'tensor.data[0] should be 1'); + assert.equal(tensor.data[1], 2, 'tensor.data[1] should be 2'); + assert.equal(tensor.data[2], 3, 'tensor.data[2] should be 3'); + assert.equal(tensor.data[3], 4, 'tensor.data[3] should be 4'); + assert.equal(tensor.data.length, 4, 'tensor.data.length should be 4'); + }); +}); + +(globalF16 ? 
describe.skip : describe)( + 'Tensor Constructor Tests - check type float16 (Float16Array not available)', + () => { + it( + "[float16] new Tensor('float16', numbers, dims): " + + "expect to throw because it's not allowed to construct 'float16' tensor from number array", + () => { + assert.throws(() => new Tensor('float16', [1, 2, 3, 4], [2, 2]), TypeError); + }, + ); + + it("[float16] new Tensor('float16', uint16array, dims): allow Uint16Array", () => { + const tensor = new Tensor('float16', new Uint16Array([15360, 16384, 16896, 17408]), [2, 2]); + assert.equal(tensor.type, 'float16', "tensor.type should be 'float16'"); + assert(tensor.data instanceof Uint16Array, "tensor.data should be an instance of 'Uint16Array'"); + }); + }, +); diff --git a/js/common/test/unit-tests/tensor/constructor-type.ts b/js/common/test/unit-tests/tensor/constructor-type.ts index 02390800e8611..d86e18ba744b8 100644 --- a/js/common/test/unit-tests/tensor/constructor-type.ts +++ b/js/common/test/unit-tests/tensor/constructor-type.ts @@ -105,14 +105,6 @@ describe('Tensor Constructor Tests - check types', () => { assert(tensor.data instanceof Uint8Array, "tensor.data should be an instance of 'Uint8Array'"); }); - it( - "[float16] new Tensor('float16', numbers, dims): " + - "expect to throw because it's not allowed to construct 'float16' tensor from number array", - () => { - assert.throws(() => new Tensor('float16', [1, 2, 3, 4], [2, 2]), TypeError); - }, - ); - it("[badtype] new Tensor('a', numbers, dims): expect to throw because 'a' is an invalid type", () => { assert.throws(() => new TensorAny('a', [1, 2, 3, 4], [2, 2]), TypeError); }); diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml b/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml index a4d5a73118ea2..2b73f82615bba 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml @@ -1,4 +1,8 @@ steps: +- task: NodeTool@0 + inputs: + # requires Node.js v22 for float16 testing (the V8 flag "--js-float16array") + versionSpec: '22.x' - script: | npm ci workingDirectory: '$(Build.SourcesDirectory)/js' @@ -11,6 +15,10 @@ steps: npm test workingDirectory: '$(Build.SourcesDirectory)/js/common' displayName: 'run onnxruntime-common tests' +- script: | + npm run test:f16 + workingDirectory: '$(Build.SourcesDirectory)/js/common' + displayName: 'run onnxruntime-common tests (enable Float16Array)' - script: | npm ci workingDirectory: '$(Build.SourcesDirectory)/js/web' diff --git a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml index 87836880cbdb8..2e3589ee87c29 100644 --- a/tools/ci_build/github/azure-pipelines/templates/web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/web-ci.yml @@ -83,9 +83,6 @@ stages: git submodule update --init -- cmake/external/onnx workingDirectory: '$(Build.SourcesDirectory)' displayName: 'Checkout submodule onnx' - - task: NodeTool@0 - inputs: - versionSpec: '20.x' - template: linux-web-init-and-check.yml - task: Bash@3 displayName: 'Extract commit SHA and save to __commit.txt' From 325ee30916fef0ce56c5fcaa45e43b27673cb950 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Tue, 4 Mar 2025 08:50:14 +0800 Subject: [PATCH 09/46] [js/webgpu] Reland the optimization of ConvTranspose (#23858) This PR fixes the errors in the ConvTranspose optimization and adds tests to ensure the correctness of 
the implementation. --- .../ops/3rd-party/conv_backprop_webgpu.ts | 96 +++++++++++--- js/web/test/data/ops/conv-transpose.jsonc | 122 ++++++++++++++++++ 2 files changed, 199 insertions(+), 19 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts index ad1de42106d6d..50620cea33863 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts @@ -46,6 +46,11 @@ export const createConvTranspose2DProgramInfo = ( const inputChannelsPerGroup = wShape[2] / group; const outputChannelsPerGroup = wShape[3]; const aComponents = isChannelsLast ? getMaxComponents(inputChannelsPerGroup) : 1; + const packInputAs4 = isChannelsLast && outputChannelsPerGroup === 1 && inputChannelsPerGroup >= 4; + const inputChannelsPerGroupInt = packInputAs4 + ? Math.floor(inputChannelsPerGroup / 4) * 4 + : Math.floor(inputChannelsPerGroup / aComponents) * aComponents; + const inputChannelsRemainder = inputChannelsPerGroup - inputChannelsPerGroupInt; const components = isChannelsLast ? getMaxComponents(outputChannelsPerGroup) : 1; const bComponents = isChannelsLast ? (outputChannelsPerGroup === 1 ? aComponents : components) : 1; const outputSize = ShapeUtil.size(outputShape) / components; @@ -78,6 +83,7 @@ export const createConvTranspose2DProgramInfo = ( { type: DataType.uint32, data: dilations }, { type: DataType.uint32, data: effectiveFilterDims }, { type: DataType.int32, data: pads }, + { type: DataType.uint32, data: inputChannelsPerGroupInt }, { type: DataType.uint32, data: inputChannelsPerGroup }, { type: DataType.uint32, data: outputChannelsPerGroup }, ...createTensorShapeVariables(inputs[0].dims, inputs[1].dims), @@ -96,6 +102,7 @@ export const createConvTranspose2DProgramInfo = ( { name: 'dilations', type: 'u32', length: filterDims.length }, { name: 'effective_filter_dims', type: 'u32', length: effectiveFilterDims.length }, { name: 'pads', type: 'i32', length: pads.length }, + { name: 'input_channels_per_group_int', type: 'u32' }, { name: 'input_channels_per_group', type: 'u32' }, { name: 'output_channels_per_group', type: 'u32' }, ]; @@ -114,16 +121,40 @@ export const createConvTranspose2DProgramInfo = ( const calculateResult = (): string => { let calcStr = ''; - if (aComponents === 1) { - calcStr += ` - let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)}; - let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)}; - dotProd = dotProd + xValue * wValue;`; + if (packInputAs4) { + if (aComponents === 4) { + calcStr += ` + let xValue = ${dy.getByOffset('x_offset')}; + let wValue = ${w.getByOffset('w_offset')}; + dotProd = dotProd + dot(xValue, wValue); + x_offset += 1u; + w_offset += 1u;`; + } else if (aComponents === 2) { + calcStr += ` + dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')})); + x_offset += 2u; + w_offset += 2u;`; + } else if (aComponents === 1) { + calcStr += ` + dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}, ${dy.getByOffset('x_offset + 2u')}, ${dy.getByOffset('x_offset + 3u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')}, ${w.getByOffset('w_offset + 2u')}, ${w.getByOffset('w_offset + 3u')})); + x_offset += 4u; 
+ w_offset += 4u;`; + } } else { - if (outputChannelsPerGroup === 1) { + calcStr += ` + let xValue = ${ + isChannelsLast + ? dy.getByOffset( + `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`, + ) + : dy.get('batch', 'inputChannel', 'idyR', 'idyC') + }; + `; + if (aComponents === 1) { calcStr += ` - let wValue = ${w.getByOffset(`${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)} / ${bComponents}`)}; - dotProd = dotProd + dot(xValue, wValue);`; + let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)}; + let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)}; + dotProd = dotProd + xValue * wValue;`; } else { for (let c = 0; c < aComponents; c++) { calcStr += ` @@ -134,6 +165,32 @@ export const createConvTranspose2DProgramInfo = ( } return calcStr; }; + const calculateRemainder = (): string => { + if (inputChannelsRemainder === 0) { + return ''; + } + if (!packInputAs4) { + throw new Error(`packInputAs4 ${packInputAs4} is not true.`); + } + let calcStr = ''; + if (aComponents === 1) { + calcStr += 'dotProd = dotProd'; + for (let i = 0; i < inputChannelsRemainder; i++) { + calcStr += ` + + ${dy.getByOffset(`x_offset + ${i}`)} * ${w.getByOffset(`w_offset + ${i}`)}`; + } + calcStr += ';'; + } else if (aComponents === 2) { + if (inputChannelsRemainder !== 2) { + throw new Error(`Invalid inputChannelsRemainder ${inputChannelsRemainder}.`); + } + calcStr += ` + let xValue = ${dy.getByOffset('x_offset')}; + let wValue = ${w.getByOffset('w_offset')}; + dotProd = dotProd + dot(xValue, wValue);`; + } + return calcStr; + }; const codeSnippet = ` let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)}; let batch = ${output.indicesGet('outputIndices', 0)}; @@ -169,7 +226,6 @@ export const createConvTranspose2DProgramInfo = ( // Minimum wC >= 0 that satisfies (dyCCorner + wC) % (uniforms.strides.y) == 0 wC = u32(((dyCCorner + i32(uniforms.strides.y) - 1) / i32(uniforms.strides.y)) * i32(uniforms.strides.y) - dyCCorner); } - for (; wC < uniforms.effective_filter_dims.y; wC = wC + 1) { if (wC % uniforms.dilations.y != 0) { continue; @@ -182,17 +238,19 @@ export const createConvTranspose2DProgramInfo = ( } let idyC: u32 = u32(dyC); var inputChannel = groupId * uniforms.input_channels_per_group; - for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + ${aComponents}) { - let xValue = ${ - isChannelsLast - ? dy.getByOffset( - `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`, - ) - : dy.get('batch', 'inputChannel', 'idyR', 'idyC') - }; + ${ + packInputAs4 + ? ` + var x_offset = ${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}; + var w_offset = ${w.indicesToOffset(`${w.type.indices}(wRPerm, wCPerm, inputChannel, wOutChannel)`)} / ${bComponents}; + ` + : '' + } + for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group_int; d2 = d2 + ${packInputAs4 ? 4 : aComponents}) { ${calculateResult()} - inputChannel = inputChannel + ${aComponents}; + inputChannel = inputChannel + ${packInputAs4 ? 
4 : aComponents}; } + ${calculateRemainder()} wC = wC + uniforms.strides.y - 1; } wR = wR + uniforms.strides[0] - 1; @@ -211,7 +269,7 @@ export const createConvTranspose2DProgramInfo = ( return { name: 'ConvTranspose2D', shaderCache: { - hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${outputChannelsPerGroup === 1}`, + hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${packInputAs4}${inputChannelsRemainder}`, inputDependencies, }, getRunData: () => ({ diff --git a/js/web/test/data/ops/conv-transpose.jsonc b/js/web/test/data/ops/conv-transpose.jsonc index 6429845d23df9..008d58530ee36 100644 --- a/js/web/test/data/ops/conv-transpose.jsonc +++ b/js/web/test/data/ops/conv-transpose.jsonc @@ -348,6 +348,128 @@ } ] }, + { + "name": "ConvTranspose NHWC- group - A", + "operator": "ConvTranspose", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [1, 1], "type": "ints" }, + { "name": "group", "data": 2, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0, 32.0, 34.0], + "dims": [1, 2, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0], + "dims": [2, 1, 1, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0, 1, 2, 3, 4, 5, 6, 7, 8, 36, 40, 44, 48, 52, 56, 60, 64, 68], + "dims": [1, 2, 3, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "ConvTranspose NHWC- group - B", + "operator": "ConvTranspose", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + }, + { + "data": [0.125, 0.25, 0.375], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 0.125, 1.125, 4.125, 4.125, 3.125, 13.125, 23.125, 18.125, 15.125, 43.125, 53.125, 36.125, 18.125, 45.125, + 52.125, 32.125, 45.25, 104.25, 115.25, 66.25, 123.25, 279.25, 305.25, 172.25, 159.25, 357.25, 383.25, + 214.25, 105.25, 232.25, 247.25, 136.25, 162.375, 351.375, 370.375, 200.375, 387.375, 833.375, 875.375, + 470.375, 231.375, 494.375, 517.375, 276.375, 0.375, 0.375, 0.375, 0.375 + ], + "dims": [1, 3, 4, 4], + "type": "float32" + } + ] + } + ] + }, + { + "name": "ConvTranspose NHWC- group - C", + "operator": "ConvTranspose", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + 
"data": [ + 0, 1, 4, 7, 6, 4, 16, 26, 36, 26, 20, 56, 66, 76, 50, 24, 59, 66, 73, 44, 60, 137, 148, 159, 90, 164, 368, + 394, 420, 234, 212, 472, 498, 524, 290, 140, 307, 322, 337, 184, 216, 465, 484, 503, 270, 516, 1104, 1146, + 1188, 634, 596, 1272, 1314, 1356, 722, 352, 747, 770, 793, 420 + ], + "dims": [1, 3, 4, 5], + "type": "float32" + } + ] + } + ] + }, { "name": "ConvTranspose with bias addition C", "operator": "ConvTranspose", From 30c682547bdae3e09523614f8e38526e49ca8fbc Mon Sep 17 00:00:00 2001 From: Alessio Soldano Date: Tue, 4 Mar 2025 04:12:22 +0100 Subject: [PATCH 10/46] [OpenVINO] Fix a build warning (#23877) ### Description Fix a warning with std::move usage ### Motivation and Context Possibly allow building without --compile_no_warning_as_error flag --- onnxruntime/core/providers/openvino/backends/basic_backend.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 3ac4d22f5453c..44b811e6af2c0 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -653,7 +653,7 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe const auto& out_name = item.first; auto node = item.second; Ort::UnownedValue output_tensor = GetOutputTensor(context, - std::move(out_name), + out_name, subgraph_context_.output_names, node); auto mem_info = output_tensor.GetTensorMemoryInfo(); From bde4fbec9707f542f0b67540b051f7646ef5ab71 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 4 Mar 2025 09:37:00 -0800 Subject: [PATCH 11/46] Change gsl::byte to std::byte (#23872) To be compatible with the latest GSL library. Without this fix we will get: ``` onnxruntime\core\providers\cpu\controlflow\loop.cc(247): error C4996: 'gsl::byte': Use std::byte instead. 
``` --- onnxruntime/core/providers/cpu/controlflow/loop.cc | 4 ++-- onnxruntime/core/providers/cuda/controlflow/loop.cc | 4 ++-- onnxruntime/test/providers/base_tester.cc | 6 +++--- onnxruntime/test/providers/base_tester.h | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/cpu/controlflow/loop.cc b/onnxruntime/core/providers/cpu/controlflow/loop.cc index c65dd2a04bf55..b33b1f189594b 100644 --- a/onnxruntime/core/providers/cpu/controlflow/loop.cc +++ b/onnxruntime/core/providers/cpu/controlflow/loop.cc @@ -244,7 +244,7 @@ static Status ConcatenateCpuOutput(void* /*stream*/, // we can't easily use a C++ template for the tensor element type, // so use a span for some protection but work in bytes - gsl::span output_span = gsl::make_span(static_cast(output), + gsl::span output_span = gsl::make_span(static_cast(output), output_size_in_bytes); for (size_t i = 0, num_iterations = per_iteration_output.size(); i < num_iterations; ++i) { @@ -257,7 +257,7 @@ static Status ConcatenateCpuOutput(void* /*stream*/, " Expected:", per_iteration_shape, " Got:", iteration_data.Shape()); } - auto src = gsl::make_span(static_cast(iteration_data.DataRaw()), + auto src = gsl::make_span(static_cast(iteration_data.DataRaw()), bytes_per_iteration); auto dst = output_span.subspan(i * bytes_per_iteration, bytes_per_iteration); gsl::copy(src, dst); diff --git a/onnxruntime/core/providers/cuda/controlflow/loop.cc b/onnxruntime/core/providers/cuda/controlflow/loop.cc index 3295b73a800c9..d66de7c74e647 100644 --- a/onnxruntime/core/providers/cuda/controlflow/loop.cc +++ b/onnxruntime/core/providers/cuda/controlflow/loop.cc @@ -84,10 +84,10 @@ static Status ConcatenateGpuOutput(void* stream, std::vector& per_iter CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cur_output, iteration_data.DataRaw(), bytes_per_iteration, cudaMemcpyDeviceToDevice, static_cast(stream))); - cur_output = static_cast((static_cast(cur_output) + bytes_per_iteration)); + cur_output = static_cast((static_cast(cur_output) + bytes_per_iteration)); } - ORT_ENFORCE(static_cast(cur_output) - static_cast(output) == output_size_in_bytes, + ORT_ENFORCE(static_cast(cur_output) - static_cast(output) == output_size_in_bytes, "Concatenation did not fill output buffer as expected."); return Status::OK(); diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 6bfe7bc3856ba..eecff3fa4d8ff 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -174,7 +174,7 @@ static std::unique_ptr MakeSparseTensor(MLDataType data_type, cons return p_tensor; } -void BaseTester::CopyDataToTensor(gsl::span data, Tensor& dst) { +void BaseTester::CopyDataToTensor(gsl::span data, Tensor& dst) { ORT_ENFORCE(dst.SizeInBytes() >= data.size_bytes(), "Not enough space in the destination tensor"); memcpy(dst.MutableDataRaw(), data.data(), data.size_bytes()); } @@ -203,7 +203,7 @@ void BaseTester::AddSparseCooTensorData(std::vector& data, MLDataType data_type, const char* name, gsl::span dims, - gsl::span values, + gsl::span values, gsl::span indices, const ValidateOutputParams& check_params, const std::vector* dim_params) { @@ -247,7 +247,7 @@ void BaseTester::AddSparseCsrTensorData(std::vector& data, MLDataType data_type, const char* name, gsl::span dims, - gsl::span values, + gsl::span values, gsl::span inner_indices, gsl::span outer_indices, const ValidateOutputParams& check_params, diff --git a/onnxruntime/test/providers/base_tester.h 
b/onnxruntime/test/providers/base_tester.h index 512b3402c5986..d39cc3c750dec 100644 --- a/onnxruntime/test/providers/base_tester.h +++ b/onnxruntime/test/providers/base_tester.h @@ -868,7 +868,7 @@ class BaseTester { void AddShapeToTensorData(NodeArg& node_arg, gsl::span dims, const std::vector* dim_params); - void CopyDataToTensor(gsl::span data, Tensor& dst); + void CopyDataToTensor(gsl::span data, Tensor& dst); #if !defined(DISABLE_SPARSE_TENSORS) NodeArg MakeSparseNodeArg(int32_t dtype, const char* name, @@ -879,7 +879,7 @@ class BaseTester { MLDataType data_type, const char* name, gsl::span dims, - gsl::span values, + gsl::span values, gsl::span indices, const ValidateOutputParams& check_params, const std::vector* dim_params = nullptr); @@ -895,7 +895,7 @@ class BaseTester { MLDataType data_type, const char* name, gsl::span dims, - gsl::span values, + gsl::span values, gsl::span inner_indices, gsl::span outer_indices, const ValidateOutputParams& check_params, From 17dcea7a662bf21725c754719952ae54e51d6d23 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 4 Mar 2025 10:06:36 -0800 Subject: [PATCH 12/46] Allow using extended minimal build for several EPs (#23834) ### Description #### Background From code search, the following EPs use `onnxruntime::GetCpuPreferredNodes()` in their `GetCapabilities()` methods: - CANN - CUDA - DML - JS - ROCM - WebGPU However, the source file that implements `onnxruntime::GetCpuPreferredNodes()` is excluded when minimal build is ON: https://github.com/microsoft/onnxruntime/blob/6df0973e58ba5399fcaa98686f70ed9a9e59aaef/cmake/onnxruntime_framework.cmake#L38-L42 This means that all EPs mentioned above is not able to compile with minimal build. #### Solution The excluded file `core/framework/fallback_cpu_capability.cc` cannot build in minimal build because some of its dependencies are not included in the minimal build. However, in extended minimal build mode, all dependencies are available. This PR looses the restrict and allows to compile this file when it is extended minimal build. After this change, those EPs are able to compile in extended minimal build. --- cmake/onnxruntime_framework.cmake | 5 +---- cmake/onnxruntime_providers_js.cmake | 6 +++++- onnxruntime/core/framework/fallback_cpu_capability.cc | 4 ++++ onnxruntime/core/framework/fallback_cpu_capability.h | 4 ++++ 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index b1e98a9e0411c..9c9a25f8ee77e 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -36,10 +36,7 @@ elseif(onnxruntime_ENABLE_TRITON) endif() if (onnxruntime_MINIMAL_BUILD) - set(onnxruntime_framework_src_exclude - "${ONNXRUNTIME_ROOT}/core/framework/fallback_cpu_capability.h" - "${ONNXRUNTIME_ROOT}/core/framework/fallback_cpu_capability.cc" - ) + set(onnxruntime_framework_src_exclude) # custom ops support must be explicitly enabled in a minimal build. exclude if not. if (NOT onnxruntime_MINIMAL_BUILD_CUSTOM_OPS) diff --git a/cmake/onnxruntime_providers_js.cmake b/cmake/onnxruntime_providers_js.cmake index 9811eae611463..fefbab5082da4 100644 --- a/cmake/onnxruntime_providers_js.cmake +++ b/cmake/onnxruntime_providers_js.cmake @@ -1,6 +1,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+ if (onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD) + message(FATAL_ERROR "JSEP can not be used in a basic minimal build. Please build with '--minimal_build extended'") + endif() + add_compile_definitions(USE_JSEP=1) file(GLOB_RECURSE onnxruntime_providers_js_cc_srcs @@ -18,4 +22,4 @@ onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers Boost::mp11 Eigen3::Eigen ) - add_dependencies(onnxruntime_providers_js ${onnxruntime_EXTERNAL_DEPENDENCIES}) \ No newline at end of file + add_dependencies(onnxruntime_providers_js ${onnxruntime_EXTERNAL_DEPENDENCIES}) diff --git a/onnxruntime/core/framework/fallback_cpu_capability.cc b/onnxruntime/core/framework/fallback_cpu_capability.cc index 1eb7420b44d2c..d3e435c0341b0 100644 --- a/onnxruntime/core/framework/fallback_cpu_capability.cc +++ b/onnxruntime/core/framework/fallback_cpu_capability.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + #include "core/framework/fallback_cpu_capability.h" #include "core/common/inlined_containers.h" @@ -176,3 +178,5 @@ std::unordered_set GetCpuPreferredNodes(const onnxruntime::GraphViewe } } // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/framework/fallback_cpu_capability.h b/onnxruntime/core/framework/fallback_cpu_capability.h index bca75adbfd5a7..ddcc1de96d2af 100644 --- a/onnxruntime/core/framework/fallback_cpu_capability.h +++ b/onnxruntime/core/framework/fallback_cpu_capability.h @@ -3,6 +3,8 @@ #pragma once +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + #include #include "core/common/inlined_containers_fwd.h" #include "core/framework/execution_provider.h" // for IExecutionProvider::IKernelLookup @@ -26,3 +28,5 @@ std::unordered_set GetCpuPreferredNodes(const GraphViewer& graph, const logging::Logger& logger); } // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) From 813bdaab8d13ec029a32545e28d05aebdceb610e Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 4 Mar 2025 10:08:18 -0800 Subject: [PATCH 13/46] Add dawn to ThirdPartyNotices (#23876) ### Description Add `dawn` to ThirdPartyNotices. --- ThirdPartyNotices.txt | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 26084ab42ec1c..a449e42f6bf19 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -6045,3 +6045,38 @@ https://github.com/intel/neural-speed terms, and open source software license terms. These separate license terms govern your use of the third party programs as set forth in the "THIRD-PARTY-PROGRAMS" file. + +_____ + +dawn + +https://dawn.googlesource.com/dawn + + BSD 3-Clause License + + Copyright 2017-2023 The Dawn & Tint Authors + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. 
+     contributors may be used to endorse or promote products derived from
+     this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
From 9d0dc9f062fa496422c3679204ecb62c65430758 Mon Sep 17 00:00:00 2001
From: Hector Li
Date: Tue, 4 Mar 2025 16:20:06 -0800
Subject: [PATCH 14/46] Enable QNN EP weight sharing generation using public
 API (#23702)

### Description

Enable QNN EP weight sharing generation using the public API instead of internal interfaces, so that users can integrate it into their own toolchains. The change shares the QnnBackendManager across ORT sessions when `ep.share_ep_contexts` is enabled, and adds an extra option (`ep.stop_share_ep_contexts`) to end the sharing so that we know when to remove the shared QnnBackendManager from the singleton.

The tool is renamed from onnxruntime_qnn_ctx_gen to ep_weight_sharing_ctx_gen, so that it can be reused for other EPs.
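For reference, driving this from the public API looks roughly like the following (a minimal sketch based on the session options used by the new tool and tests; the model paths and HTP backend library name are placeholders):

```cpp
// Sketch: compile two models into one shared QNN context binary.
#include "onnxruntime_cxx_api.h"
#include "onnxruntime_session_options_config_keys.h"

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "weight_sharing");

  Ort::SessionOptions so;
  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");     // dump EPContext models
  so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");  // keep context binary external
  so.AddConfigEntry(kOrtSessionOptionShareEpContexts, "1");     // share the QnnBackendManager
  so.AppendExecutionProvider("QNN", {{"backend_path", "QnnHtp.dll"},
                                     {"enable_htp_weight_sharing", "1"}});

  // The first session seeds the shared QnnBackendManager in the singleton.
  Ort::Session session1(env, ORT_TSTR("model1.onnx"), so);

  // The last session ends the sharing, so the shared manager is released once
  // its context binary (holding the graphs of both models) has been written.
  so.AddConfigEntry(kOrtSessionOptionStopShareEpContexts, "1");
  Ort::Session session2(env, ORT_TSTR("model2.onnx"), so);
  return 0;
}
```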
---
 cmake/onnxruntime_python.cmake                |   2 +-
 cmake/onnxruntime_unittests.cmake             |  33 +--
 .../onnxruntime_session_options_config_keys.h |   5 +-
 .../qnn/builder/qnn_backend_manager.cc        |   2 +
 .../providers/qnn/qnn_execution_provider.cc   |  42 ++-
 .../providers/qnn/qnn_execution_provider.h    |   1 +
 .../core/providers/qnn/shared_context.h       |  26 ++
 .../README.md                                 |  10 +-
 .../command_args_parser.cc                    |  47 ++--
 .../command_args_parser.h                     |   0
 .../test/ep_weight_sharing_ctx_gen/main.cc    | 247 +++++++++++++++++
 .../test_configuration.h                      |   7 +-
 .../test/providers/qnn/qnn_ep_context_test.cc | 222 ++++++++++------
 onnxruntime/test/qnn_ctx_gen/main.cc          | 250 ------------------
 setup.py                                      |   2 +-
 15 files changed, 519 insertions(+), 377 deletions(-)
 rename onnxruntime/test/{qnn_ctx_gen => ep_weight_sharing_ctx_gen}/README.md (82%)
 rename onnxruntime/test/{qnn_ctx_gen => ep_weight_sharing_ctx_gen}/command_args_parser.cc (68%)
 rename onnxruntime/test/{qnn_ctx_gen => ep_weight_sharing_ctx_gen}/command_args_parser.h (100%)
 create mode 100644 onnxruntime/test/ep_weight_sharing_ctx_gen/main.cc
 rename onnxruntime/test/{qnn_ctx_gen => ep_weight_sharing_ctx_gen}/test_configuration.h (75%)
 delete mode 100644 onnxruntime/test/qnn_ctx_gen/main.cc

diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index aee6d2ff7655c..64b53c2912be0 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -1029,7 +1029,7 @@ if (onnxruntime_USE_QNN)
     add_custom_command(
       TARGET onnxruntime_pybind11_state POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E copy
-        $
+        $
        $/onnxruntime/capi/
     )
     if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index cb5a28f82de66..2ed7923941643 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -1289,31 +1289,34 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)

   if(onnxruntime_USE_QNN)
     #qnn ctx generator
-    set(onnxruntime_qnn_ctx_gen_src_dir ${TEST_SRC_DIR}/qnn_ctx_gen)
-    set(onnxruntime_qnn_ctx_gen_src_patterns
-    "${onnxruntime_qnn_ctx_gen_src_dir}/*.cc"
-    "${onnxruntime_qnn_ctx_gen_src_dir}/*.h")
+    set(ep_weight_sharing_ctx_gen_src_dir ${TEST_SRC_DIR}/ep_weight_sharing_ctx_gen)
+    set(ep_weight_sharing_ctx_gen_src_patterns
+    "${ep_weight_sharing_ctx_gen_src_dir}/*.cc"
+    "${ep_weight_sharing_ctx_gen_src_dir}/*.h")

-    file(GLOB onnxruntime_qnn_ctx_gen_src CONFIGURE_DEPENDS
-      ${onnxruntime_qnn_ctx_gen_src_patterns}
+    file(GLOB ep_weight_sharing_ctx_gen_src CONFIGURE_DEPENDS
+      ${ep_weight_sharing_ctx_gen_src_patterns}
     )
-    onnxruntime_add_executable(onnxruntime_qnn_ctx_gen ${onnxruntime_qnn_ctx_gen_src})
-    target_include_directories(onnxruntime_qnn_ctx_gen PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT}
-      ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir}
-      ${CMAKE_CURRENT_BINARY_DIR})
+    onnxruntime_add_executable(ep_weight_sharing_ctx_gen ${ep_weight_sharing_ctx_gen_src})
+    target_include_directories(ep_weight_sharing_ctx_gen PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR})
     if (WIN32)
-      target_compile_options(onnxruntime_qnn_ctx_gen PRIVATE ${disabled_warnings})
+      target_compile_options(ep_weight_sharing_ctx_gen PRIVATE ${disabled_warnings})
       if (NOT DEFINED SYS_PATH_LIB)
         set(SYS_PATH_LIB shlwapi)
       endif()
     endif()

-    if(WIN32)
-      target_link_libraries(onnxruntime_qnn_ctx_gen PRIVATE debug dbghelp advapi32)
+    if (onnxruntime_BUILD_SHARED_LIB)
+      set(ep_weight_sharing_ctx_gen_libs onnxruntime_common onnxruntime ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE})
+      target_link_libraries(ep_weight_sharing_ctx_gen PRIVATE ${ep_weight_sharing_ctx_gen_libs})
+      if (WIN32)
+        target_link_libraries(ep_weight_sharing_ctx_gen PRIVATE debug dbghelp advapi32)
+      endif()
+    else()
+      target_link_libraries(ep_weight_sharing_ctx_gen PRIVATE onnxruntime_session ${onnxruntime_test_providers_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE})
     endif()
-    target_link_libraries(onnxruntime_qnn_ctx_gen PRIVATE onnx_test_runner_common onnxruntime_test_utils onnxruntime_common onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers onnx_test_data_proto ${onnxruntime_test_providers_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE} ${SYS_PATH_LIB} ${CMAKE_DL_LIBS})
-    set_target_properties(onnxruntime_qnn_ctx_gen PROPERTIES FOLDER "ONNXRuntimeTest")
+    set_target_properties(ep_weight_sharing_ctx_gen PROPERTIES FOLDER "ONNXRuntimeTest")
   endif()

   # shared lib
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 117a2cdabca2f..af1f9c04b2831 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -315,9 +315,12 @@ static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed
 // in case user need to merge/connect multiple EPContext nodes in one model
 static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_node_name_prefix";

-// Share EP related resources across EPs
+// Share EP related resources across sessions
 static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";

+// Stop sharing EP related resources across sessions from then on
+static const char* const kOrtSessionOptionStopShareEpContexts = "ep.stop_share_ep_contexts";
+
 // Use this config when dumping EP context model with an external initializers file
 // All initializers will be inside the external data file if specified, otherwise all in Onnx file
 static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
index bcde69beceef7..26d792c008edc 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
@@ -470,8 +470,10 @@ Status QnnBackendManager::InitializeProfiling() {
   QnnProfile_Level_t qnn_profile_level = QNN_PROFILE_LEVEL_BASIC;
   if (ProfilingLevel::BASIC == profiling_level_merge_) {
     qnn_profile_level = QNN_PROFILE_LEVEL_BASIC;
+    LOGS_DEFAULT(VERBOSE) << "Profiling level set to basic.";
   } else if (ProfilingLevel::DETAILED == profiling_level_merge_) {
     qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED;
+    LOGS_DEFAULT(VERBOSE) << "Profiling level set to detailed.";
   }
   Qnn_ErrorHandle_t result = qnn_interface_.profileCreate(backend_handle_, qnn_profile_level, &profile_backend_handle_);
   ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to create QNN profile! Error: ", QnnErrorHandleToString(result));
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 99a6f51f6f712..1ad17d96e9322 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -195,6 +195,10 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     share_ep_contexts_ =
         config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1";
     LOGS_DEFAULT(VERBOSE) << "User specified option - share EP contexts across sessions: " << share_ep_contexts_;
+
+    stop_share_ep_contexts_ =
+        config_options->GetConfigOrDefault(kOrtSessionOptionStopShareEpContexts, "0") == "1";
+    LOGS_DEFAULT(VERBOSE) << "User specified option - stop sharing EP contexts across sessions: " << stop_share_ep_contexts_;
   }

   static const std::string BACKEND_PATH = "backend_path";
@@ -384,17 +388,27 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     }
   }

-  qnn_backend_manager_ = qnn::QnnBackendManager::Create(
-      qnn::QnnBackendManagerConfig{backend_path,
-                                   profiling_level_etw,
-                                   profiling_level,
-                                   profiling_file_path,
-                                   context_priority,
-                                   qnn_saver_path,
-                                   device_id_,
-                                   htp_arch,
-                                   soc_model,
-                                   enable_htp_weight_sharing});
+  // For context binary generation with weight sharing enabled, use the QnnBackendManager from the shared context if it exists,
+  // so that all graphs from later sessions are compiled into the same QNN context
+  if (context_cache_enabled_ && share_ep_contexts_ && SharedContext::GetInstance().GetSharedQnnBackendManager()) {
+    qnn_backend_manager_ = SharedContext::GetInstance().GetSharedQnnBackendManager();
+    // Clear the QnnBackendManager from the singleton to stop the resource sharing
+    if (stop_share_ep_contexts_) {
+      SharedContext::GetInstance().ResetSharedQnnBackendManager();
+    }
+  } else {
+    qnn_backend_manager_ = qnn::QnnBackendManager::Create(
+        qnn::QnnBackendManagerConfig{backend_path,
+                                     profiling_level_etw,
+                                     profiling_level,
+                                     profiling_file_path,
+                                     context_priority,
+                                     qnn_saver_path,
+                                     device_id_,
+                                     htp_arch,
+                                     soc_model,
+                                     enable_htp_weight_sharing});
+  }

 #if defined(_WIN32)
   if (onnxruntime::logging::EtwRegistrationManager::SupportsETW()) {
@@ -1037,6 +1051,12 @@ Status QNNExecutionProvider::Compile(const std::vector& fused
                                                           qnn_context_embed_mode_,
                                                           max_spill_fill_buffer_size,
                                                           logger));
+
+    if (share_ep_contexts_ && !stop_share_ep_contexts_ &&
+        nullptr == SharedContext::GetInstance().GetSharedQnnBackendManager()) {
+      ORT_RETURN_IF_NOT(SharedContext::GetInstance().SetSharedQnnBackendManager(qnn_backend_manager_),
+                        "Failed to set shared QnnBackendManager.");
+    }
   }
   return Status::OK();
 }
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
index 31c34855ca4c0..0f40e40c2fa36 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -90,6 +90,7 @@ class QNNExecutionProvider : public IExecutionProvider {
   uint32_t default_rpc_control_latency_ = 0;
   bool enable_HTP_FP16_precision_ = true;
   bool share_ep_contexts_ = false;
+  bool stop_share_ep_contexts_ = false;
   bool enable_spill_fill_buffer_ = false;
 #if defined(_WIN32)
   onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_ = nullptr;
diff --git a/onnxruntime/core/providers/qnn/shared_context.h b/onnxruntime/core/providers/qnn/shared_context.h
index 81de357dbe677..277a484ad8528 100644
--- a/onnxruntime/core/providers/qnn/shared_context.h
+++ b/onnxruntime/core/providers/qnn/shared_context.h
@@ -61,13 +61,39 @@ class SharedContext {
     return graph_exist;
   }

+  bool SetSharedQnnBackendManager(std::shared_ptr<qnn::QnnBackendManager>& qnn_backend_manager) {
+    const std::lock_guard<std::mutex> lock(mtx_);
+
+    if (qnn_backend_manager_ != nullptr) {
+      if (qnn_backend_manager_ == qnn_backend_manager) {
+        return true;
+      }
+      return false;
+    }
+    qnn_backend_manager_ = qnn_backend_manager;
+    return true;
+  }
+
+  std::shared_ptr<qnn::QnnBackendManager> GetSharedQnnBackendManager() {
+    const std::lock_guard<std::mutex> lock(mtx_);
+    return qnn_backend_manager_;
+  }
+
+  void ResetSharedQnnBackendManager() {
+    const std::lock_guard<std::mutex> lock(mtx_);
+    qnn_backend_manager_.reset();
+  }
+
  private:
   SharedContext() = default;
   ~SharedContext() = default;

   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SharedContext);

+  // Used for passing through QNN models (deserialized from context binary) across sessions
   std::vector<std::unique_ptr<qnn::QnnModel>> shared_qnn_models_;
+  // Used for compiling multiple models into the same QNN context binary
+  std::shared_ptr<qnn::QnnBackendManager> qnn_backend_manager_;
   // Producer sessions can be in parallel
   // Consumer sessions have to be after producer sessions initialized
   std::mutex mtx_;
diff --git a/onnxruntime/test/qnn_ctx_gen/README.md b/onnxruntime/test/ep_weight_sharing_ctx_gen/README.md
similarity index 82%
rename from onnxruntime/test/qnn_ctx_gen/README.md
rename to onnxruntime/test/ep_weight_sharing_ctx_gen/README.md
index 97ab89d79cbd2..be1a1fe039366 100644
--- a/onnxruntime/test/qnn_ctx_gen/README.md
+++ b/onnxruntime/test/ep_weight_sharing_ctx_gen/README.md
@@ -2,17 +2,19 @@
 This tool provides a way to generate Onnx models that wrap a QNN context binary, with weight sharing enabled. The options to use with the tool are listed below:

-`onnxruntime_qnn_ctx_gen [options...] model_path,model_path`
+`ep_weight_sharing_ctx_gen [options...] model_1_path,model_2_path`
-./onnxruntime_qnn_ctx_gen -v -i "soc_model|60 htp_graph_finalization_optimization_mode|3" -C "ep.context_enable|1 ep.context_embed_mode|0" /mnt/c/model1.onnx,/mnt/c/model2.onnx
+./ep_weight_sharing_ctx_gen -e qnn -v -i "soc_model|60 htp_graph_finalization_optimization_mode|3" /mnt/c/model1.onnx,/mnt/c/model2.onnx

 Options:
-
+
+ -e [qnn|tensorrt|openvino|vitisai]: Specifies the compile-based provider: qnn, tensorrt, openvino, vitisai. Default is qnn.

 -v: Show verbose information.

 -C: [session_config_entries]: Specify session configuration entries as key-value pairs: -C "| |"
    Refer to onnxruntime_session_options_config_keys.h for valid keys and values.
-   [Example] -C "ep.context_enable|1 ep.context_embed_mode|0"
+   [Example] -C "ep.context_enable|1 ep.context_embed_mode|0". These are set by default, so they can be omitted.

 -i: [provider_options]: Specify QNN EP specific runtime options as key value pairs. Different runtime options available are:
    [Usage]: -i '| |'
diff --git a/onnxruntime/test/qnn_ctx_gen/command_args_parser.cc b/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc
similarity index 68%
rename from onnxruntime/test/qnn_ctx_gen/command_args_parser.cc
rename to onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc
index 24c343c7b9541..bf21d54ccde41 100644
--- a/onnxruntime/test/qnn_ctx_gen/command_args_parser.cc
+++ b/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc
@@ -1,5 +1,4 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
-// Copyright (c) 2023 NVIDIA Corporation.
 // Licensed under the MIT License.

 #include "command_args_parser.h"
@@ -29,28 +28,30 @@ namespace qnnctxgen {

 /*static*/ void CommandLineParser::ShowUsage() {
   printf(
-      "onnxruntime_qnn_ctx_gen [options...] model1_path,model2_path\n"
-      "Example: ./onnxruntime_qnn_ctx_gen -i \"soc_model|60 htp_graph_finalization_optimization_mode|3\" -C \"ep.context_node_name_prefix|_part1\" ./model1.onnx,./model2.onnx\n"
+      "ep_weight_sharing_ctx_gen [options...] model1_path,model2_path\n"
+      "Example: ./ep_weight_sharing_ctx_gen -i \"soc_model|60 htp_graph_finalization_optimization_mode|3\" -C \"ep.context_node_name_prefix|_part1\" ./model1.onnx,./model2.onnx\n"
       "Options:\n"
+      "\t-e [qnn|tensorrt|openvino|vitisai]: Specifies the compile-based provider: 'qnn', 'tensorrt', 'openvino', 'vitisai'. "
+      "Default:'qnn'.\n"
       "\t-v: Show verbose information.\n"
       "\t-C: Specify session configuration entries as key-value pairs: -C \"| |\" \n"
       "\t    Refer to onnxruntime_session_options_config_keys.h for valid keys and values. \n"
       "\t    Force ep.context_enable to 1 and ep.context_embed_mode to 0. Change ep.context_file_path is not allowed."
       "\t    [Example] -C \"ep.context_node_name_prefix|_part1\" \n"
-      "\t-i: Specify QNN EP specific runtime options as key value pairs. Different runtime options available are: \n"
+      "\t-i: Specify EP specific runtime options as key-value pairs. Different runtime options available are: \n"
       "\t    [Usage]: -i '| |'\n"
       "\n"
-      "\t    [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. default to HTP backend\n"
-      "\t    [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
-      "\t    [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: '0', '1', '2', '3', default is '0'.\n"
-      "\t    [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n"
-      "\t    [htp_arch]: The minimum HTP architecture. 
The driver will use ops compatible with this architecture. eg: '0', '68', '69', '73', '75'. Defaults to '0' (none). \n" - "\t [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n" + "\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. default to HTP backend\n" + "\t [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n" + "\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: '0', '1', '2', '3', default is '0'.\n" + "\t [QNN only] [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n" + "\t [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. eg: '0', '68', '69', '73', '75'. Defaults to '0' (none). \n" + "\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n" "\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n" - "\t [enable_htp_weight_sharing]: Allows common weights across graphs to be shared and stored in a single context binary. Defaults to '1' (enabled).\n" - "\t [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n" - "\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n" - "\t [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary." + "\t [QNN only] [enable_htp_weight_sharing]: Allows common weights across graphs to be shared and stored in a single context binary. Defaults to '1' (enabled).\n" + "\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n" + "\t Defaults to '1' (QNN EP handles the graph I/O quantization and dequantization). \n" + "\t [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary." 
"\t [Example] -i \"vtcm_mb|8 htp_arch|73\" \n" "\n" "\t-h: help\n"); @@ -109,8 +110,22 @@ static bool ParseSessionConfigs(const std::string& configs_string, /*static*/ bool CommandLineParser::ParseArguments(TestConfig& test_config, int argc, ORTCHAR_T* argv[]) { int ch; - while ((ch = getopt(argc, argv, ORT_TSTR("o:u:i:C:vh"))) != -1) { + while ((ch = getopt(argc, argv, ORT_TSTR("e:o:u:i:C:vh"))) != -1) { switch (ch) { + case 'e': + if (!CompareCString(optarg, ORT_TSTR("qnn"))) { + test_config.machine_config.provider_type_name = onnxruntime::kQnnExecutionProvider; + } else if (!CompareCString(optarg, ORT_TSTR("openvino"))) { + test_config.machine_config.provider_type_name = onnxruntime::kOpenVINOExecutionProvider; + } else if (!CompareCString(optarg, ORT_TSTR("tensorrt"))) { + test_config.machine_config.provider_type_name = onnxruntime::kTensorrtExecutionProvider; + } else if (!CompareCString(optarg, ORT_TSTR("vitisai"))) { + test_config.machine_config.provider_type_name = onnxruntime::kVitisAIExecutionProvider; + } else { + fprintf(stderr, "The execution provider is not included in this tool.\n"); + return false; + } + break; case 'v': test_config.run_config.f_verbose = true; break; @@ -162,7 +177,7 @@ static bool ParseSessionConfigs(const std::string& configs_string, 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer'])"); } - test_config.run_config.qnn_options[key] = value; + test_config.run_config.provider_options[key] = value; } break; } diff --git a/onnxruntime/test/qnn_ctx_gen/command_args_parser.h b/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.h similarity index 100% rename from onnxruntime/test/qnn_ctx_gen/command_args_parser.h rename to onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.h diff --git a/onnxruntime/test/ep_weight_sharing_ctx_gen/main.cc b/onnxruntime/test/ep_weight_sharing_ctx_gen/main.cc new file mode 100644 index 0000000000000..104cdbdfd5abc --- /dev/null +++ b/onnxruntime/test/ep_weight_sharing_ctx_gen/main.cc @@ -0,0 +1,247 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "test_configuration.h" +#include "command_args_parser.h" + +// onnxruntime dependencies +#include "core/session/onnxruntime_cxx_api.h" +#include "core/session/onnxruntime_session_options_config_keys.h" + +// onnx dependencies +#include "onnx/onnx_pb.h" +#include + +using namespace onnxruntime; +using ProviderOptions = std::unordered_map; + +// from the last context cache Onnx model, find the EPContext node with main_context=1, +// and get the QNN context binary file name, this context binary contains all graphs from all Onnx models +// get the max spill fill buffer size +static void GetLastContextBinaryFileName(const std::basic_string last_onnx_ctx_file, + std::string& last_ctx_bin_file, + int64_t& max_size) { + max_size = 0; + + onnx::ModelProto model; + std::ifstream onnx_file_stream(last_onnx_ctx_file, std::ios_base::binary); + model.ParseFromIstream(&onnx_file_stream); + + for (auto& node : model.graph().node()) { + if (node.op_type() == "EPContext") { + int64_t is_main_context = 0; + for (auto& attr : node.attribute()) { + if (attr.name() == "main_context") { + is_main_context = attr.i(); + } + if (attr.name() == "max_size") { + max_size = attr.i(); + } + if (attr.name() == "ep_cache_context") { + last_ctx_bin_file = attr.s(); + } + } + if (is_main_context) { + return; + } + } + } + + onnx_file_stream.close(); +} + +// Update generated context cache Onnx model to make the main EPContext node point to +// the last QNN context binary file +// Remove not used QNN context binary file, only keep the last one which contains all graphs +static void UpdateEpContextModel(const std::vector>& ep_ctx_files, + const std::string& last_qnn_ctx_binary_file_name, + int64_t max_size) { + for (auto ep_ctx_file : ep_ctx_files) { + onnx::ModelProto model; + std::ifstream onnx_file_stream(ep_ctx_file, std::ios_base::binary); + model.ParseFromIstream(&onnx_file_stream); + onnx_file_stream.close(); + + for (auto& node : *(model.mutable_graph()->mutable_node())) { + if (node.op_type() == "EPContext") { + int64_t is_main_context = 0; + std::string old_qnn_ctx_binary_file_name; + int max_size_index = 0; + int ep_context_index = 0; + for (auto i = 0; i < node.attribute_size(); ++i) { + auto& attr = node.attribute()[i]; + if (attr.name() == "main_context") { + is_main_context = attr.i(); + } + if (attr.name() == "max_size") { + max_size = attr.i(); + max_size_index = i; + } + if (attr.name() == "ep_cache_context") { + old_qnn_ctx_binary_file_name = attr.s(); + ep_context_index = 0; + } + } + if (is_main_context) { + auto path_str = ToPathString(ep_ctx_file); + auto path = std::filesystem::path(path_str); + auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name); + std::remove(file_path.string().c_str()); + + node.mutable_attribute(max_size_index)->set_i(max_size); + node.mutable_attribute(ep_context_index)->set_s(last_qnn_ctx_binary_file_name); + } + } + } + + // re-write the onnx ctx file + std::ofstream onnx_file_ostream(ep_ctx_file, std::ios_base::binary); + model.SerializeToOstream(&onnx_file_ostream); + onnx_file_ostream.close(); + } +} + +#ifdef _WIN32 +int real_main(int argc, wchar_t* argv[]) { +#else +int real_main(int argc, char* argv[]) { +#endif + qnnctxgen::TestConfig test_config; + if (!qnnctxgen::CommandLineParser::ParseArguments(test_config, argc, argv)) { + qnnctxgen::CommandLineParser::ShowUsage(); + return -1; + } + + OrtLoggingLevel logging_level = test_config.run_config.f_verbose + ? 
ORT_LOGGING_LEVEL_VERBOSE + : ORT_LOGGING_LEVEL_ERROR; + Ort::Env env(logging_level, "ep_weight_sharing"); + + ORT_TRY { + Ort::SessionOptions so; + so.SetLogId("ep_weight_sharing_ctx_gen_session_logger"); + // Set default session option to dump EPContext model with non-embed mode + so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); + so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0"); + // enable ep.share_ep_contexts + so.AddConfigEntry(kOrtSessionOptionShareEpContexts, "1"); + + ProviderOptions provider_options; + + for (auto it : test_config.run_config.provider_options) { + provider_options[it.first] = it.second; + } + + for (auto it : test_config.run_config.session_config_entries) { + if (it.first == kOrtSessionOptionEpContextEnable && it.second != "1") { + std::cerr << "Need to enable ep context cache." << std::endl; + continue; + } + if (it.first == kOrtSessionOptionEpContextEmbedMode && it.second != "0") { + std::cerr << "Only support non-embed model for weight sharing." << std::endl; + continue; + } + if (it.first == kOrtSessionOptionEpContextFilePath) { + std::cout << "Not support to specify the generated Onnx context cache file name." << std::endl; + continue; + } + so.AddConfigEntry(it.first.c_str(), it.second.c_str()); + } + + for (auto model_path : test_config.model_file_paths) { + std::cout << "Model file path: " << ToUTF8String(model_path) << std::endl; + } + + // Generate context cache model files with QNN context binary files + // The context binary file generated later includes all graphs from previous models + { + std::string provider_name_ = test_config.machine_config.provider_type_name; + if (provider_name_ == onnxruntime::kQnnExecutionProvider) { +#ifdef USE_QNN +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + // set default QNN EP option to enable weight sharing if not set by user + const std::string enable_htp_weight_sharing = "enable_htp_weight_sharing"; + if (provider_options.find(enable_htp_weight_sharing) == provider_options.end()) { + provider_options[enable_htp_weight_sharing] = "1"; + } + so.AppendExecutionProvider("QNN", provider_options); +#else + ORT_THROW("QNN is not supported in this build\n"); +#endif + } else if (!provider_name_.empty()) { + ORT_THROW("This execution provider is not included in this tool.\n"); + } + + size_t total_file_count = test_config.model_file_paths.size(); + for (size_t i = 0; i < total_file_count; ++i) { + auto model_path = test_config.model_file_paths[i]; + std::cout << "Generating context cache model for: " << ToUTF8String(model_path) << std::endl; + if (i == total_file_count - 1) { + so.AddConfigEntry(kOrtSessionOptionStopShareEpContexts, "1"); + } + Ort::Session session(env, model_path.c_str(), so); + } + } + + std::cout << "Start to update the generated Onnx model." 
<< std::endl; + std::vector> ep_ctx_files; + ep_ctx_files.reserve(test_config.model_file_paths.size()); + for (auto model_path : test_config.model_file_paths) { + auto pos = model_path.find_last_of(ORT_TSTR(".")); + if (pos != std::string::npos) { + model_path = model_path.substr(0, pos) + ORT_TSTR("_ctx.onnx"); + } else { + model_path = model_path + ORT_TSTR("_ctx.onnx"); + } + ep_ctx_files.push_back(model_path); + } + + // Get the last context binary file name + std::string last_qnn_ctx_binary_file_name; + int64_t max_size = 0; + GetLastContextBinaryFileName(ep_ctx_files.back(), last_qnn_ctx_binary_file_name, max_size); + std::cout << "The last context binary file: " << last_qnn_ctx_binary_file_name << std::endl; + if (last_qnn_ctx_binary_file_name.empty()) { + throw Ort::Exception("Can't find QNN context binary file from the Onnx model.", OrtErrorCode::ORT_FAIL); + } + ep_ctx_files.pop_back(); + + // Update generated context cache Onnx model to make the main EPContext node point to + // the last QNN context binary file + // Remove not used QNN context binary file, only keep the last one only which contains all graphs + UpdateEpContextModel(ep_ctx_files, last_qnn_ctx_binary_file_name, max_size); + } + ORT_CATCH(const Ort::Exception& e) { + std::cerr << "Failed to generate context cache file: " << e.what(); + return -1; + } + + std::cout << "Generation done!"; + return 0; +} + +#ifdef _WIN32 +int wmain(int argc, wchar_t* argv[]) { +#else +int main(int argc, char* argv[]) { +#endif + int retval = -1; + ORT_TRY { + retval = real_main(argc, argv); + } + ORT_CATCH(const std::exception& ex) { + ORT_HANDLE_EXCEPTION([&]() { + fprintf(stderr, "%s\n", ex.what()); + retval = -1; + }); + } + + ::google::protobuf::ShutdownProtobufLibrary(); + + return retval; +} diff --git a/onnxruntime/test/qnn_ctx_gen/test_configuration.h b/onnxruntime/test/ep_weight_sharing_ctx_gen/test_configuration.h similarity index 75% rename from onnxruntime/test/qnn_ctx_gen/test_configuration.h rename to onnxruntime/test/ep_weight_sharing_ctx_gen/test_configuration.h index bf4c7061a3484..198d03211f561 100644 --- a/onnxruntime/test/qnn_ctx_gen/test_configuration.h +++ b/onnxruntime/test/ep_weight_sharing_ctx_gen/test_configuration.h @@ -14,15 +14,20 @@ namespace onnxruntime { namespace qnnctxgen { +struct MachineConfig { + std::string provider_type_name{onnxruntime::kQnnExecutionProvider}; +}; + struct RunConfig { bool f_verbose{false}; std::unordered_map session_config_entries; - std::unordered_map qnn_options; + std::unordered_map provider_options; }; struct TestConfig { std::vector> model_file_paths; RunConfig run_config; + MachineConfig machine_config; }; } // namespace qnnctxgen diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index e50dd7c214240..3dec74599abdf 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -43,6 +43,35 @@ static const std::string& GetNodeAttr(const Node& node, const std::string& attr_ return default_val; } +// from the context cache Onnx model, find the EPContext node with main_context=1, +// and get the QNN context binary file name +static void GetContextBinaryFileName(const std::string onnx_ctx_file, + std::string& last_ctx_bin_file, + const Logger& logger) { + std::shared_ptr ctx_model; + ASSERT_STATUS_OK(Model::Load(ToPathString(onnx_ctx_file), ctx_model, nullptr, logger)); + auto& ctx_graph = ctx_model->MainGraph(); + for (auto& node : 
ctx_graph.Nodes()) { + if (node.OpType() == "EPContext") { + int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast(0)); + if (1 == is_main_context) { + last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", ""); + return; + } + } + } +} + +// Get context binary file name from Context model file and remove it with the context model file +void CleanUpCtxFile(std::string context_file_path) { + std::string qnn_ctx_binary_file_name; + GetContextBinaryFileName(context_file_path, qnn_ctx_binary_file_name, + DefaultLoggingManager().DefaultLogger()); + + ASSERT_EQ(std::remove(qnn_ctx_binary_file_name.c_str()), 0); + ASSERT_EQ(std::remove(context_file_path.c_str()), 0); +} + // Create a model with FusedMatMul + Add (quantized) // input1 -> Add -> Q -> DQ ---- // | @@ -123,22 +152,22 @@ void QnnContextBinaryMultiPartitionTestBody(bool single_ep_node = true) { const auto model_data_span = AsByteSpan(model_data.data(), model_data.size()); - const std::string context_binary_file = "./qnn_context_binary_multi_partition_test.onnx"; - std::remove(context_binary_file.c_str()); + const std::string context_model_file = "./qnn_context_binary_multi_partition_test.onnx"; + std::remove(context_model_file.c_str()); Ort::SessionOptions so; so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); - so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str()); + so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_model_file.c_str()); so.AppendExecutionProvider("QNN", provider_options); Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), so); // Make sure the Qnn context cache binary file is generated - EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + EXPECT_TRUE(std::filesystem::exists(context_model_file.c_str())); int ep_context_node_count = 0; int non_ep_context_node_count = 0; std::shared_ptr ctx_model; - ASSERT_STATUS_OK(Model::Load(ToPathString(context_binary_file), ctx_model, nullptr, DefaultLoggingManager().DefaultLogger())); + ASSERT_STATUS_OK(Model::Load(ToPathString(context_model_file), ctx_model, nullptr, DefaultLoggingManager().DefaultLogger())); auto& ctx_graph = ctx_model->MainGraph(); for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { @@ -156,7 +185,7 @@ void QnnContextBinaryMultiPartitionTestBody(bool single_ep_node = true) { Ort::SessionOptions so2; // context file path is required if it's non-embed mode and the model is loaded from memory - so2.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str()); + so2.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_model_file.c_str()); so2.AppendExecutionProvider("QNN", provider_options); std::string ctx_model_data; @@ -164,7 +193,7 @@ void QnnContextBinaryMultiPartitionTestBody(bool single_ep_node = true) { Ort::Session session2(*ort_env, ctx_model_data.data(), ctx_model_data.size(), so2); // clean up - ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + CleanUpCtxFile(context_model_file); } // Test that models with 1 non-quantized FusedMatMul node and 1 quantized Add node can still generate the context binary @@ -237,7 +266,7 @@ void EpCtxCpuNodeWithExternalIniFileTestBody(bool expect_external_ini_file) { // clean up ASSERT_EQ(std::remove(model_with_ext.c_str()), 0); ASSERT_EQ(std::remove(model_ext_file_full_path.c_str()), 0); - ASSERT_EQ(std::remove(ep_context_model_file.c_str()), 0); + CleanUpCtxFile(ep_context_model_file); } // Set the external initializer size threshold to 1024 so FusedMatMul 
(which fallback on CPU) @@ -444,21 +473,21 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryGeneration2InputTypes) { const auto model_data_span = AsByteSpan(model_data.data(), model_data.size()); - const std::string context_binary_file = "./qnn_context_binary_int32_fp32_inputs_test.onnx"; - std::remove(context_binary_file.c_str()); + const std::string context_model_file = "./qnn_context_binary_int32_fp32_inputs_test.onnx"; + std::remove(context_model_file.c_str()); Ort::SessionOptions so; so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); - so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str()); + so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_model_file.c_str()); so.AppendExecutionProvider("QNN", provider_options); Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), so); // Make sure the Qnn context cache binary file is generated - EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + EXPECT_TRUE(std::filesystem::exists(context_model_file.c_str())); // clean up - ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + CleanUpCtxFile(context_model_file); } // Generate context cache model from the ONNX models with 2 inputs. @@ -481,26 +510,26 @@ TEST_F(QnnHTPBackendTests, QnnContextGeneration2InputsOrderIssue) { auto& logging_manager = DefaultLoggingManager(); logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR); - const std::string context_binary_file = "./qnn_ctx_2_inputs_order_test_gen.onnx"; + const std::string context_model_file = "./qnn_ctx_2_inputs_order_test_gen.onnx"; Ort::SessionOptions so; so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); - so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str()); + so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_model_file.c_str()); so.AppendExecutionProvider("QNN", provider_options); Ort::Session session(*ort_env, ORT_TSTR("testdata/qnn_ctx_2_inputs_order_test.onnx"), so); // Make sure the Qnn context cache binary file is generated - EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + EXPECT_TRUE(std::filesystem::exists(context_model_file.c_str())); std::shared_ptr model; - ASSERT_STATUS_OK(Model::Load(ToPathString(context_binary_file), model, nullptr, DefaultLoggingManager().DefaultLogger())); + ASSERT_STATUS_OK(Model::Load(ToPathString(context_model_file), model, nullptr, DefaultLoggingManager().DefaultLogger())); auto inputs = model->MainGraph().GetInputs(); EXPECT_TRUE(inputs.size() == 2); EXPECT_TRUE(inputs[0]->Name() == "attention_mask"); EXPECT_TRUE(inputs[1]->Name() == "Add_input_0"); // clean up - ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + CleanUpCtxFile(context_model_file); } TEST_F(QnnHTPBackendTests, QnnContextGenerationNodeNamePrefix) { @@ -519,20 +548,20 @@ TEST_F(QnnHTPBackendTests, QnnContextGenerationNodeNamePrefix) { auto& logging_manager = DefaultLoggingManager(); logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR); - const std::string context_binary_file = "./qnn_ctx_2_inputs_order_test_gen.onnx"; + const std::string context_model_file = "./qnn_ctx_2_inputs_order_test_gen.onnx"; Ort::SessionOptions so; so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); - so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str()); + so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_model_file.c_str()); so.AddConfigEntry(kOrtSessionOptionEpContextNodeNamePrefix, node_name_prefix.c_str()); 
so.AppendExecutionProvider("QNN", provider_options); Ort::Session session(*ort_env, ORT_TSTR("testdata/qnn_ctx_2_inputs_order_test.onnx"), so); // Make sure the Qnn context cache binary file is generated - EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + EXPECT_TRUE(std::filesystem::exists(context_model_file.c_str())); std::shared_ptr model; - ASSERT_STATUS_OK(Model::Load(ToPathString(context_binary_file), model, nullptr, DefaultLoggingManager().DefaultLogger())); + ASSERT_STATUS_OK(Model::Load(ToPathString(context_model_file), model, nullptr, DefaultLoggingManager().DefaultLogger())); for (auto& node : model->MainGraph().Nodes()) { if (node.OpType() == "EPContext") { EXPECT_TRUE(node.Name().find(node_name_prefix) != std::string::npos); @@ -540,7 +569,7 @@ TEST_F(QnnHTPBackendTests, QnnContextGenerationNodeNamePrefix) { } // clean up - ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + CleanUpCtxFile(context_model_file); } // Run QDQ model on HTP 3 times @@ -554,12 +583,12 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheEmbedModeTest) { provider_options["backend_path"] = "libQnnHtp.so"; #endif provider_options["offload_graph_io_quantization"] = "0"; - const std::string context_binary_file = "./qnn_context_binary_test.onnx"; - std::remove(context_binary_file.c_str()); + const std::string context_model_file = "./qnn_context_binary_test.onnx"; + std::remove(context_model_file.c_str()); std::unordered_map session_option_pairs; session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1"); - session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file); + session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_model_file); const TestInputDef input_def({1, 2, 3}, false, -10.0f, 10.0f); const std::string op_type = "Atan"; @@ -577,11 +606,11 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheEmbedModeTest) { session_option_pairs); // Make sure the Qnn context cache binary file is generated - EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + EXPECT_TRUE(std::filesystem::exists(context_model_file.c_str())); // 2nd run directly loads and run from Qnn context cache model std::unordered_map session_option_pairs2; - session_option_pairs2.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file); + session_option_pairs2.emplace(kOrtSessionOptionEpContextFilePath, context_model_file); TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def}, {}, {}), BuildQDQOpTestCase(op_type, {input_def}, {}, {}), provider_options, @@ -589,10 +618,10 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheEmbedModeTest) { ExpectedEPNodeAssignment::All, QDQTolerance(), logging::Severity::kERROR, - context_binary_file, + context_model_file, session_option_pairs2); // Clean up - ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + CleanUpCtxFile(context_model_file); } // Run QDQ model on HTP 3 times @@ -889,12 +918,12 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { provider_options["backend_path"] = "libQnnHtp.so"; #endif provider_options["offload_graph_io_quantization"] = "0"; - const std::string context_binary_file = "./qnn_context_binary_2inputs_test.onnx"; - std::remove(context_binary_file.c_str()); + const std::string context_model_file = "./qnn_context_binary_2inputs_test.onnx"; + std::remove(context_model_file.c_str()); std::unordered_map session_option_pairs; session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1"); - session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, 
context_binary_file); + session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_model_file); const TestInputDef input_def1({1, 2, 3}, false, -10.0f, 10.0f); const TestInputDef input_def2({1, 2, 3}, false, -10.0f, 10.0f); @@ -913,11 +942,11 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { session_option_pairs); // Make sure the Qnn context cache binary file is generated - EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + EXPECT_TRUE(std::filesystem::exists(context_model_file.c_str())); // 2nd run directly loads and run from Qnn context cache model std::unordered_map session_option_pairs2; - session_option_pairs2.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file); + session_option_pairs2.emplace(kOrtSessionOptionEpContextFilePath, context_model_file); TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def1, input_def2}, {}, {}), BuildQDQOpTestCase(op_type, {input_def1, input_def2}, {}, {}), provider_options, @@ -925,10 +954,10 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { ExpectedEPNodeAssignment::All, QDQTolerance(), logging::Severity::kERROR, - context_binary_file, + context_model_file, session_option_pairs2); // Clean up - ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + CleanUpCtxFile(context_model_file); } // Context binary only contains a single QNN graph, generated context cache model (detached mode) only has 1 EPContext node @@ -1062,44 +1091,20 @@ static void CreateQdqModel(const std::string& model_file_name, const Logger& log static void DumpModelWithSharedCtx(const ProviderOptions& provider_options, const std::string& onnx_model_path1, const std::string& onnx_model_path2) { - SessionOptions so; - so.session_logid = "qnn_ctx_model_logger"; - ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1")); - ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0")); - RunOptions run_options; - run_options.run_tag = so.session_logid; - - auto qnn_ep = QnnExecutionProviderWithOptions(provider_options, &so); - std::shared_ptr qnn_ep_shared(std::move(qnn_ep)); + Ort::SessionOptions so; + so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); + so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0"); + // enable ep.share_ep_contexts so that QNNEP share the QnnBackendManager across sessions + so.AddConfigEntry(kOrtSessionOptionShareEpContexts, "1"); - InferenceSessionWrapper session_object1{so, GetEnvironment()}; - ASSERT_STATUS_OK(session_object1.RegisterExecutionProvider(qnn_ep_shared)); - ASSERT_STATUS_OK(session_object1.Load(ToPathString(onnx_model_path1))); - ASSERT_STATUS_OK(session_object1.Initialize()); + so.AppendExecutionProvider("QNN", provider_options); - InferenceSessionWrapper session_object2{so, GetEnvironment()}; - ASSERT_STATUS_OK(session_object2.RegisterExecutionProvider(qnn_ep_shared)); - ASSERT_STATUS_OK(session_object2.Load(ToPathString(onnx_model_path2))); - ASSERT_STATUS_OK(session_object2.Initialize()); -} + // Create 2 sessions to generate context binary models, the 1st session will share the QnnBackendManager + // to the 2nd session, so graphs from these 2 models are all included in the 2nd context binary + Ort::Session session1(*ort_env, ToPathString(onnx_model_path1).c_str(), so); -// from the last context ache Onnx model, find the EPContext node with main_context=1, -// and get the QNN context binary file name, thie context binary contains all graphs from all Onnx models -static void 
GetLastContextBinaryFileName(const std::string last_onnx_ctx_file, - std::string& last_ctx_bin_file, - const Logger& logger) { - std::shared_ptr ctx_model; - ASSERT_STATUS_OK(Model::Load(ToPathString(last_onnx_ctx_file), ctx_model, nullptr, logger)); - auto& ctx_graph = ctx_model->MainGraph(); - for (auto& node : ctx_graph.Nodes()) { - if (node.OpType() == "EPContext") { - int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast(0)); - if (1 == is_main_context) { - last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", ""); - return; - } - } - } + so.AddConfigEntry(kOrtSessionOptionStopShareEpContexts, "1"); + Ort::Session session2(*ort_env, ToPathString(onnx_model_path2).c_str(), so); } // Update generated context cache Onnx model to make the main EPContext node point to @@ -1187,10 +1192,10 @@ TEST_F(QnnHTPBackendTests, QnnContextShareAcrossSessions1) { DumpModelWithSharedCtx(provider_options, onnx_model_paths[0], onnx_model_paths[1]); - // Get the last context binary file name + // Get the last context binary file name, the latest context binary file holds all graphs generated from all models std::string last_qnn_ctx_binary_file_name; - GetLastContextBinaryFileName(ctx_model_paths.back(), last_qnn_ctx_binary_file_name, - DefaultLoggingManager().DefaultLogger()); + GetContextBinaryFileName(ctx_model_paths.back(), last_qnn_ctx_binary_file_name, + DefaultLoggingManager().DefaultLogger()); EXPECT_TRUE(!last_qnn_ctx_binary_file_name.empty()); // Update generated context cache Onnx model to make the main EPContext node point to @@ -1293,8 +1298,8 @@ TEST_F(QnnHTPBackendTests, QnnContextShareAcrossSessions2) { // Get the last context binary file name std::string last_qnn_ctx_binary_file_name; - GetLastContextBinaryFileName(ctx_model_paths.back(), last_qnn_ctx_binary_file_name, - DefaultLoggingManager().DefaultLogger()); + GetContextBinaryFileName(ctx_model_paths.back(), last_qnn_ctx_binary_file_name, + DefaultLoggingManager().DefaultLogger()); EXPECT_TRUE(!last_qnn_ctx_binary_file_name.empty()); // Update generated context cache Onnx model to make the main EPContext node point to @@ -1357,6 +1362,69 @@ TEST_F(QnnHTPBackendTests, QnnContextShareAcrossSessions2) { } std::remove(last_qnn_ctx_binary_file_name.c_str()); } + +// For Ort sessions to generate the context binary, with session option ep.share_ep_contexts enabled +// Ort sessions will share the QnnBackendManager, so that all graphs from all models compile into the same Qnn context +TEST_F(QnnHTPBackendTests, QnnContextGenWeightSharingSessionAPI) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + provider_options["offload_graph_io_quantization"] = "0"; + + // Create QDQ models + std::vector onnx_model_paths{"./weight_share1.onnx", "./weight_share2.onnx"}; + std::vector ctx_model_paths; + for (auto model_path : onnx_model_paths) { + CreateQdqModel(model_path, DefaultLoggingManager().DefaultLogger()); + EXPECT_TRUE(std::filesystem::exists(model_path.c_str())); + auto pos = model_path.find_last_of("."); + if (pos != std::string::npos) { + model_path = model_path.substr(0, pos) + "_ctx.onnx"; + } else { + model_path = model_path + "_ctx.onnx"; + } + ctx_model_paths.push_back(model_path); + } + + Ort::SessionOptions so; + so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); + so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0"); + // enable ep.share_ep_contexts so that QNNEP share the 
QnnBackendManager across sessions + so.AddConfigEntry(kOrtSessionOptionShareEpContexts, "1"); + + so.AppendExecutionProvider("QNN", provider_options); + + Ort::Session session1(*ort_env, ToPathString(onnx_model_paths[0]).c_str(), so); + std::string qnn_ctx_binary_file_name1; + GetContextBinaryFileName(ctx_model_paths[0], qnn_ctx_binary_file_name1, + DefaultLoggingManager().DefaultLogger()); + EXPECT_TRUE(!qnn_ctx_binary_file_name1.empty()); + + // Tell the EP stop share the QnnBackendManager from this session then on + so.AddConfigEntry(kOrtSessionOptionStopShareEpContexts, "1"); + Ort::Session session2(*ort_env, ToPathString(onnx_model_paths[1]).c_str(), so); + std::string qnn_ctx_binary_file_name2; + GetContextBinaryFileName(ctx_model_paths[1], qnn_ctx_binary_file_name2, + DefaultLoggingManager().DefaultLogger()); + EXPECT_TRUE(!qnn_ctx_binary_file_name2.empty()); + + auto file_size_1 = std::filesystem::file_size(qnn_ctx_binary_file_name1); + auto file_size_2 = std::filesystem::file_size(qnn_ctx_binary_file_name2); + EXPECT_TRUE(file_size_2 > file_size_1); + + // clean up + for (auto model_path : onnx_model_paths) { + ASSERT_EQ(std::remove(model_path.c_str()), 0); + } + for (auto ctx_model_path : ctx_model_paths) { + ASSERT_EQ(std::remove(ctx_model_path.c_str()), 0); + } + ASSERT_EQ(std::remove(qnn_ctx_binary_file_name1.c_str()), 0); + ASSERT_EQ(std::remove(qnn_ctx_binary_file_name2.c_str()), 0); +} #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) } // namespace test diff --git a/onnxruntime/test/qnn_ctx_gen/main.cc b/onnxruntime/test/qnn_ctx_gen/main.cc deleted file mode 100644 index bb5007b40b072..0000000000000 --- a/onnxruntime/test/qnn_ctx_gen/main.cc +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -// onnxruntime dependencies -#include "test_configuration.h" -#include -#include -#include -#include "command_args_parser.h" -#include - -#include "core/session/onnxruntime_session_options_config_keys.h" -#include "core/session/inference_session.h" -#include "core/session/ort_env.h" -#include "core/providers/provider_factory_creators.h" -#include "core/common/logging/sinks/clog_sink.h" - -#include "core/graph/model.h" -#include "core/session/environment.h" -#include "core/common/logging/logging.h" - -using namespace onnxruntime; -const OrtApi* g_ort = NULL; -std::unique_ptr ort_env; - -static void CheckStatus(const Status& status) { - if (status.Code() != common::StatusCode::OK) { - std::string msg = status.ErrorMessage(); - throw Ort::Exception(std::move(msg), OrtErrorCode::ORT_FAIL); - } -} - -static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) { - const auto& attributes = node.GetAttributes(); - if (auto entry = attributes.find(attr_name); entry != attributes.end()) { - return entry->second.i(); - } - - return default_val; -} - -static const std::string& GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) { - const auto& attributes = node.GetAttributes(); - if (auto entry = attributes.find(attr_name); entry != attributes.end()) { - return entry->second.s(); - } - - return default_val; -} - -// from the last context cache Onnx model, find the EPContext node with main_context=1, -// and get the QNN context binary file name, this context binary contains all graphs from all Onnx models -// get the max spill fill buffer size -static void GetLastContextBinaryFileName(const std::basic_string last_onnx_ctx_file, - std::string& last_ctx_bin_file, - int64_t& max_size) { - max_size = 0; - std::shared_ptr ctx_model; - CheckStatus(Model::Load(ToPathString(last_onnx_ctx_file), ctx_model, nullptr, - (*((OrtEnv*)*ort_env.get())->GetEnvironment().GetLoggingManager()).DefaultLogger())); - auto& ctx_graph = ctx_model->MainGraph(); - for (auto& node : ctx_graph.Nodes()) { - if (node.OpType() == "EPContext") { - int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast(0)); - max_size = GetNodeAttr(node, "max_size", static_cast(0)); - if (1 == is_main_context) { - last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", ""); - return; - } - } - } -} - -// Update generated context cache Onnx model to make the main EPContext node point to -// the last QNN context binary file -// Remove not used QNN context binary file, only keep the last one which contains all graphs -static void UpdateEpContextModel(const std::vector>& ep_ctx_files, - const std::string& last_qnn_ctx_binary_file_name, - int64_t max_size) { - for (auto ep_ctx_file : ep_ctx_files) { - std::shared_ptr ctx_model; - auto path_str = ToPathString(ep_ctx_file); - CheckStatus(Model::Load(path_str, ctx_model, nullptr, - (*((OrtEnv*)*ort_env.get())->GetEnvironment().GetLoggingManager()).DefaultLogger())); - auto& ctx_graph = ctx_model->MainGraph(); - GraphViewer graph_viewer(ctx_graph); - auto path = std::filesystem::path(path_str); - - for (auto& node : ctx_graph.Nodes()) { - if (node.OpType() == "EPContext") { - int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast(0)); - if (1 == is_main_context) { - std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", ""); - auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name); - std::remove(file_path.string().c_str()); - 
node.ClearAttribute("ep_cache_context"); - node.AddAttribute("ep_cache_context", last_qnn_ctx_binary_file_name); - node.ClearAttribute("max_size"); - node.AddAttribute("max_size", max_size); - } - } - } - std::remove(ToUTF8String(ep_ctx_file).c_str()); - CheckStatus(Model::Save(*ctx_model.get(), ToPathString(ep_ctx_file))); - } -} - -#ifdef _WIN32 -int real_main(int argc, wchar_t* argv[]) { -#else -int real_main(int argc, char* argv[]) { -#endif - g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION); - qnnctxgen::TestConfig test_config; - if (!qnnctxgen::CommandLineParser::ParseArguments(test_config, argc, argv)) { - qnnctxgen::CommandLineParser::ShowUsage(); - return -1; - } - - { - bool failed = false; - ORT_TRY { - OrtLoggingLevel logging_level = test_config.run_config.f_verbose - ? ORT_LOGGING_LEVEL_VERBOSE - : ORT_LOGGING_LEVEL_WARNING; - - ort_env = std::make_unique(logging_level, "Default"); - } - ORT_CATCH(const Ort::Exception& e) { - ORT_HANDLE_EXCEPTION([&]() { - fprintf(stderr, "Error creating environment. Error-> %s \n", e.what()); - failed = true; - }); - } - - if (failed) - return -1; - } - - ORT_TRY { - SessionOptions so; - so.session_logid = "qnn_ctx_gen_session_logger"; - // Set default session option to dump QNN context model with non-embed mode - CheckStatus(so.config_options.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1")); - CheckStatus(so.config_options.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0")); - RunOptions run_options; - run_options.run_tag = so.session_logid; - - ProviderOptions provider_options; -#if defined(_WIN32) - provider_options["backend_path"] = "QnnHtp.dll"; -#else - provider_options["backend_path"] = "libQnnHtp.so"; -#endif - // set default QNN EP option to enable weight sharing - provider_options["enable_htp_weight_sharing"] = "1"; - - for (auto it : test_config.run_config.qnn_options) { - provider_options[it.first] = it.second; - } - - for (auto it : test_config.run_config.session_config_entries) { - if (it.first == kOrtSessionOptionEpContextEnable && it.second != "1") { - std::cerr << "Need to enable ep context cache." << std::endl; - continue; - } - if (it.first == kOrtSessionOptionEpContextEmbedMode && it.second != "0") { - std::cerr << "Only support non-embed model for weight sharing." << std::endl; - continue; - } - if (it.first == kOrtSessionOptionEpContextFilePath) { - std::cout << "Not support to specify the generated Onnx context cache file name." << std::endl; - continue; - } - CheckStatus(so.config_options.AddConfigEntry(it.first.c_str(), it.second.c_str())); - } - - for (auto model_path : test_config.model_file_paths) { - std::cout << "Model file path: " << ToUTF8String(model_path) << std::endl; - } - - // Generate context cache model files with QNN context binary files - // The context binary file generated later includes all graphs from previous models - { - auto ep = QNNProviderFactoryCreator::Create(provider_options, &so)->CreateProvider(); - std::shared_ptr qnn_ep(std::move(ep)); - - for (auto model_path : test_config.model_file_paths) { - std::cout << "Generate context cache model for: " << ToUTF8String(model_path) << std::endl; - InferenceSession session_object{so, ((OrtEnv*)*ort_env.get())->GetEnvironment()}; - CheckStatus(session_object.RegisterExecutionProvider(qnn_ep)); - CheckStatus(session_object.Load(ToPathString(model_path))); - CheckStatus(session_object.Initialize()); - } - } - - std::cout << "Start to update the generated Onnx model." 
-    std::vector<std::basic_string<ORTCHAR_T>> ep_ctx_files;
-    ep_ctx_files.reserve(test_config.model_file_paths.size());
-    for (auto model_path : test_config.model_file_paths) {
-      ep_ctx_files.push_back(model_path + ORT_TSTR("_ctx.onnx"));
-    }
-
-    // Get the last context binary file name
-    std::string last_qnn_ctx_binary_file_name;
-    int64_t max_size = 0;
-    GetLastContextBinaryFileName(ep_ctx_files.back(), last_qnn_ctx_binary_file_name, max_size);
-    std::cout << "The last context binary file: " << last_qnn_ctx_binary_file_name << std::endl;
-    if (last_qnn_ctx_binary_file_name.empty()) {
-      throw Ort::Exception("Can't find QNN context binary file from the Onnx model.", OrtErrorCode::ORT_FAIL);
-    }
-    ep_ctx_files.pop_back();
-
-    // Update generated context cache Onnx model to make the main EPContext node point to
-    // the last QNN context binary file
-    // Remove not used QNN context binary file, only keep the last one which contains all graphs
-    UpdateEpContextModel(ep_ctx_files, last_qnn_ctx_binary_file_name, max_size);
-  }
-  ORT_CATCH(const Ort::Exception& e) {
-    fprintf(stderr, "Failed to generate context cache file: %s \n", e.what());
-
-    ort_env.reset();
-    return -1;
-  }
-
-  ort_env.reset();
-
-  return 0;
-}
-
-#ifdef _WIN32
-int wmain(int argc, wchar_t* argv[]) {
-#else
-int main(int argc, char* argv[]) {
-#endif
-  int retval = -1;
-  ORT_TRY {
-    retval = real_main(argc, argv);
-  }
-  ORT_CATCH(const std::exception& ex) {
-    ORT_HANDLE_EXCEPTION([&]() {
-      fprintf(stderr, "%s\n", ex.what());
-      retval = -1;
-    });
-  }
-
-  ::google::protobuf::ShutdownProtobufLibrary();
-
-  return retval;
-}
diff --git a/setup.py b/setup.py
index ced2f28e38778..53e533050b245 100644
--- a/setup.py
+++ b/setup.py
@@ -356,7 +356,7 @@ def finalize_options(self):
             "libQnnSaver.so",
             "libQnnSystem.so",
             "libHtpPrepare.so",
-            "onnxruntime_qnn_ctx_gen",
+            "ep_weight_sharing_ctx_gen",
         ]
         dl_libs.extend(qnn_deps)
     if nightly_build:

From 788ca51b044bf1c7379a065213ec1b56c978c55f Mon Sep 17 00:00:00 2001
From: Ashish Garg
Date: Tue, 4 Mar 2025 23:02:58 -0800
Subject: [PATCH 15/46] [QNN-EP]: Fix inference failures while running with htp_shared_memory (#23892)

### Description
When using the enable_htp_shared_memory feature, the address of the buffer
passed to rpcmem_free is incorrect, so the RPC buffers are never freed,
which leads to memory exhaustion.

### Motivation and Context
When using the enable_htp_shared_memory_allocator feature for QNN in GenAI
extensions, it leads to inference failures during the second prompt. Because
GenAI workloads have higher memory demands, the issue surfaces sooner in
GenAI use cases.

Co-authored-by: Ashish Garg
---
 onnxruntime/core/providers/qnn/qnn_allocator.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/qnn/qnn_allocator.cc b/onnxruntime/core/providers/qnn/qnn_allocator.cc
index 1fb8742f724cd..cb92e927ff65a 100644
--- a/onnxruntime/core/providers/qnn/qnn_allocator.cc
+++ b/onnxruntime/core/providers/qnn/qnn_allocator.cc
@@ -181,7 +181,9 @@ void HtpSharedMemoryAllocator::Free(void* allocation_address) {
   // Avoid throwing exceptions as this may be running from a destructor.
   try {
     // take ownership of shared memory and free at end of scope
-    auto shared_memory = WrapSharedMemoryWithUniquePtr(allocation_address, rpcmem_lib_->Api());
+    const size_t allocation_offset = AllocationOffsetFromStartOfHeader();
+    void* raw_allocation_address = (void*)((std::byte*)allocation_address - allocation_offset);
+    auto shared_memory = WrapSharedMemoryWithUniquePtr(raw_allocation_address, rpcmem_lib_->Api());
 
     // destroy header
     allocation_header.~AllocationHeader();

From 8aed9208d28131b2b0dc022092250c39ae58768e Mon Sep 17 00:00:00 2001
From: Jie Chen
Date: Thu, 6 Mar 2025 01:59:50 +0800
Subject: [PATCH 16/46] Fix enable_pix_capture build for WebGPU (#23857)

The build option --enable_pix_capture is broken. This fixes the problem.

---------

Co-authored-by: wp
---
 cmake/external/onnxruntime_external_deps.cmake           | 9 +--------
 onnxruntime/core/providers/webgpu/webgpu_context.cc      | 1 -
 .../core/providers/webgpu/webgpu_pix_frame_generator.cc  | 4 ++--
 .../core/providers/webgpu/webgpu_pix_frame_generator.h   | 2 +-
 4 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index ebf20ab21bbd2..a579badee666c 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -304,7 +304,7 @@ if(NOT TARGET Boost::mp11)
     EXCLUDE_FROM_ALL
     FIND_PACKAGE_ARGS NAMES Boost
   )
-  onnxruntime_fetchcontent_makeavailable(mp11)
+  onnxruntime_fetchcontent_makeavailable(mp11)
   if(NOT TARGET Boost::mp11)
     add_library(Boost::mp11 ALIAS Boost::headers)
   endif()
@@ -672,17 +672,10 @@ if (onnxruntime_USE_WEBGPU)
 
   # disable things we don't use
   set(DAWN_DXC_ENABLE_ASSERTS_IN_NDEBUG OFF)
-  set(DAWN_ENABLE_DESKTOP_GL OFF CACHE BOOL "" FORCE)
-  set(DAWN_ENABLE_OPENGLES OFF CACHE BOOL "" FORCE)
-  set(DAWN_SUPPORTS_GLFW_FOR_WINDOWING OFF CACHE BOOL "" FORCE)
-  set(DAWN_USE_GLFW OFF CACHE BOOL "" FORCE)
-  set(DAWN_USE_WINDOWS_UI OFF CACHE BOOL "" FORCE)
   set(DAWN_USE_X11 OFF CACHE BOOL "" FORCE)
 
   set(TINT_BUILD_TESTS OFF CACHE BOOL "" FORCE)
   set(TINT_BUILD_CMD_TOOLS OFF CACHE BOOL "" FORCE)
-  set(TINT_BUILD_GLSL_WRITER OFF CACHE BOOL "" FORCE)
-  set(TINT_BUILD_GLSL_VALIDATOR OFF CACHE BOOL "" FORCE)
   set(TINT_BUILD_IR_BINARY OFF CACHE BOOL "" FORCE)
   set(TINT_BUILD_SPV_READER OFF CACHE BOOL "" FORCE)  # don't need. disabling is a large binary size saving
   set(TINT_BUILD_WGSL_WRITER ON CACHE BOOL "" FORCE)  # needed to create cache key. runtime error if not enabled.
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index 163dd691b7f16..2bd547f406226 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -165,7 +165,6 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi
 #if defined(ENABLE_PIX_FOR_WEBGPU_EP)
   // set pix frame generator
   pix_frame_generator_ = std::make_unique<WebGpuPIXFrameGenerator>(instance_,
-                                                                   Adapter(),
                                                                    Device());
 #else
   ORT_THROW("Support PIX capture requires extra build flags (--enable_pix_capture)");
diff --git a/onnxruntime/core/providers/webgpu/webgpu_pix_frame_generator.cc b/onnxruntime/core/providers/webgpu/webgpu_pix_frame_generator.cc
index 90b99b7b38bb1..9b287b7b7df99 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_pix_frame_generator.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_pix_frame_generator.cc
@@ -11,7 +11,7 @@ namespace onnxruntime {
 namespace webgpu {
 
-WebGpuPIXFrameGenerator::WebGpuPIXFrameGenerator(wgpu::Instance instance, wgpu::Adapter adapter, wgpu::Device device) {
+WebGpuPIXFrameGenerator::WebGpuPIXFrameGenerator(wgpu::Instance instance, wgpu::Device device) {
   // Trivial window size for surface texture creation and provide frame concept for PIX.
   static constexpr uint32_t kWidth = 512u;
   static constexpr uint32_t kHeight = 512u;
@@ -32,7 +32,7 @@ WebGpuPIXFrameGenerator::WebGpuPIXFrameGenerator(wgpu::Instance instance, wgpu::
   wgpu::TextureFormat format;
   wgpu::SurfaceCapabilities capabilities;
-  surface_.GetCapabilities(adapter, &capabilities);
+  surface_.GetCapabilities(device.GetAdapter(), &capabilities);
   format = capabilities.formats[0];
 
   wgpu::SurfaceConfiguration config;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_pix_frame_generator.h b/onnxruntime/core/providers/webgpu/webgpu_pix_frame_generator.h
index 52a7459a81eba..0d9393321284d 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_pix_frame_generator.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_pix_frame_generator.h
@@ -41,7 +41,7 @@ namespace webgpu {
 // WebGpuContext destruction.
 class WebGpuPIXFrameGenerator {
  public:
-  WebGpuPIXFrameGenerator(wgpu::Instance instance, wgpu::Adapter adapter, wgpu::Device device);
+  WebGpuPIXFrameGenerator(wgpu::Instance instance, wgpu::Device device);
   ~WebGpuPIXFrameGenerator();
 
   void GeneratePIXFrame();

From 834adde81a4f4cadc8c9e29bcb96c6479c19b42c Mon Sep 17 00:00:00 2001
From: Satya Kumar Jandhyala
Date: Wed, 5 Mar 2025 13:40:40 -0800
Subject: [PATCH 17/46] [WebGPU-EP Native] Add ReduceMean (#23860)

### Description

### Motivation and Context
---
 .../webgpu/reduction/reduction_ops.cc         | 168 ++++++++++++++++++
 .../webgpu/reduction/reduction_ops.h          |  62 +++++++
 .../webgpu/webgpu_execution_provider.cc       |   8 +-
 3 files changed, 234 insertions(+), 4 deletions(-)
 create mode 100644 onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
 create mode 100644 onnxruntime/core/providers/webgpu/reduction/reduction_ops.h

diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
new file mode 100644
index 0000000000000..eb7903e7903b6
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
@@ -0,0 +1,168 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+ +#include "core/providers/webgpu/reduction/reduction_ops.h" +#include +#include "core/framework/data_transfer_manager.h" +#include "core/providers/webgpu/data_transfer.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +namespace onnxruntime { +namespace webgpu { + +#define REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, begin, end) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + ReduceOp, \ + kOnnxDomain, \ + begin, end, \ + kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()), \ + ReduceOp); + +#define REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceOp, version) \ + ONNX_OPERATOR_KERNEL_EX( \ + ReduceOp, \ + kOnnxDomain, \ + version, \ + kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \ + ReduceOp); + +REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10); +REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12); +REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17); +REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18); + +Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + bool reduce_on_all_axes = no_op_with_empty_axes_ == false && axes_.empty(); + std::string loop_header = code_[0]; + std::string loop_body = "let current_element: input_value_t = " + input.GetByIndices("input_indices") + ";\n" + code_[1]; + std::string loop_footer = code_[2]; + const auto input_rank = input.Rank(); + for (int i = 0, l = 0; i < input_rank; ++i) { + if (reduce_on_all_axes || std::find(axes_.begin(), axes_.end(), i) != axes_.end()) { + if (keepdims_) { + l++; + } + std::stringstream ss; + std::string index = "i" + std::to_string(i); + ss << "for (var " << index << " : u32 = 0; " << index << " < " << input.IndicesGet("uniforms.input_shape", i) << "; " << index << "++) {\n"; + ss << input.IndicesSet("input_indices", i, index) << ";\n"; + ss << loop_body << "\n"; + ss << "}\n"; + loop_body = ss.str(); + } else { + std::stringstream ss; + ss << loop_header << "\n"; + std::string index = "i" + std::to_string(i); + ss << "let " << index << " = " << output.IndicesGet("output_indices", l) << ";\n"; + ss << input.IndicesSet("input_indices", i, index) << ";\n"; + loop_header = ss.str(); + l++; + } + } + std::stringstream input_indices_init_value; + for (int i = 0; i < input_rank - 1; ++i) { + input_indices_init_value << "0, "; + } + input_indices_init_value << "0"; + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << "let output_indices: output_indices_t = " << output.OffsetToIndices("global_idx") << ";\n" + << "var input_indices: input_indices_t = input_indices_t(" << input_indices_init_value.str() << ");\n" + << loop_header << loop_body << loop_footer; + shader.MainFunctionBody() << output.SetByOffset("global_idx", "output_value"); + return Status::OK(); +} + +template +Status ReduceKernel::ComputeInternal(ComputeContext& context) const { + const auto* input_tensor = context.Input(0); + InlinedVector input_axes; + auto rank = input_tensor->Shape().NumDimensions(); + auto transform_axis = [rank](int64_t axis) { + if 
+      axis += rank;
+    }
+    if (axis < 0 || static_cast<size_t>(axis) >= rank) {
+      ORT_THROW("Axes values must be in the range [-rank, rank-1]. Got: ", axis);
+    }
+    return static_cast<uint32_t>(axis);
+  };
+  // Check if axes input is provided and copy the axes values to input_axes
+  if (context.InputCount() > 1) {
+    ORT_ENFORCE(axes_.empty(), "Axes attribute may not be specified when axes input is also provided.");
+    const Tensor* axes_tensor = context.Input<Tensor>(1);
+    auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
+    const auto* data = axes_tensor->Data<int64_t>();
+    input_axes.reserve(size);
+    std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
+  } else {
+    input_axes.reserve(axes_.size());
+    std::transform(axes_.begin(), axes_.end(), std::back_inserter(input_axes), transform_axis);
+  }
+  if (input_axes.empty()) {
+    if (noop_with_empty_axes_ || rank == 0) {
+      // If axes is empty and noop_with_empty_axes_ is true, it is a no-op according to the spec
+      // If input tensor is a scalar, return the input tensor as is.
+      // This is not correct for ReduceLogSum and ReduceSumSquare
+      // TODO handle these cases separately.
+      auto output = context.Output(0, input_tensor->Shape());
+      if (output->DataRaw() != input_tensor->DataRaw()) {
+        ORT_RETURN_IF_ERROR(Info().GetDataTransferManager().CopyTensor(*input_tensor, *output));
+      }
+      return Status::OK();
+    } else {
+      // If axes is empty and noop_with_empty_axes_ is false, it is a reduction over all axes
+      input_axes.resize(rank);
+      std::iota(input_axes.begin(), input_axes.end(), 0);
+    }
+  }
+  const auto code = GetOpSpecificCode(input_tensor, input_axes.size());
+  // Compute output shape
+  std::vector<int64_t> output_shape;
+  for (size_t i = 0; i < input_tensor->Shape().NumDimensions(); ++i) {
+    if (std::find(input_axes.begin(), input_axes.end(), i) != input_axes.end()) {
+      if (keepdims_) {
+        output_shape.push_back(1);
+      }
+    } else {
+      output_shape.push_back(input_tensor->Shape()[i]);
+    }
+  }
+  TensorShape output_tensor_shape(output_shape);
+  int64_t output_size = output_tensor_shape.Size();
+  ReduceKernelProgram program("ReduceMean", keepdims_, noop_with_empty_axes_, input_axes, code);
+  program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
+      .AddOutput({context.Output(0, output_shape), ProgramTensorMetadataDependency::TypeAndRank})
+      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
+      .AddUniformVariables({{static_cast<uint32_t>(output_size)},
+                            {static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)},
+                            {input_axes},
+                            {static_cast<uint32_t>(input_axes.size())}});
+
+  return context.RunProgram(program);
+}
+
+ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const {
+  const TensorShape& input_shape = input_tensor->Shape();
+  size_t input_rank = input_shape.NumDimensions();
+  std::stringstream ss;
+  ss << "var size: u32 = 1;\n"
+     << "for (var i: u32 = 0; i < uniforms.axes_size; i += 1) { \n"
+     << "  let index = " << GetElementAt("uniforms.axes", "i", axes_size) << ";\n"
+     << "  size = size * " << GetElementAt("uniforms.input_shape", "index", input_rank) << ";\n"
+     << "}\n"
+     << "let output_value = output_value_t(sum / f32(size));";
+  ReduceOpSpecificCode code({"var sum = f32(0);", "sum += f32(current_element);", ss.str()});
+  return code;
+}
+
+Status ReduceMean::ComputeInternal(ComputeContext& ctx) const {
+  return ReduceKernel::ComputeInternal(ctx);
+}
+
+}  // namespace webgpu
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
new file mode 100644
index 0000000000000..e93eb06f20886
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/common/optional.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+#include "core/providers/webgpu/webgpu_kernel.h"
+#include "core/providers/cpu/reduction/reduction_kernel_base.h"
+#include "core/providers/webgpu/program.h"
+#include "core/providers/webgpu/shader_helper.h"
+namespace onnxruntime {
+namespace webgpu {
+// ReduceOpSpecificCode is a 3-element array of strings that represent the op specific code for the reduce operation.
+// The first element is the loop header, the second element is the loop body, and the third element is the loop footer.
+// The loop header is the code that is executed before the loop starts. The loop body is the code that is executed for each element in the loop.
+// The loop footer is the code that is executed after the loop ends.
+typedef std::array<std::string, 3> ReduceOpSpecificCode;
+class ReduceKernelProgram final : public Program<ReduceKernelProgram> {
+ public:
+  ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code) : Program{name}, keepdims_(keepdims), no_op_with_empty_axes_(no_op_with_empty_axes), axes_(axes.begin(), axes.end()), code_(code) {}
+  Status GenerateShaderCode(ShaderHelper& wgpuShaderModuleAddRef) const override;
+  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
+                                          {"no_op_with_empty_axes", ProgramUniformVariableDataType::Uint32},
+                                          {"axes", ProgramUniformVariableDataType::Uint32},
+                                          {"axes_size", ProgramUniformVariableDataType::Uint32});
+
+ private:
+  const bool keepdims_;
+  const bool no_op_with_empty_axes_;
+  InlinedVector<uint32_t> axes_;
+  ReduceOpSpecificCode code_;
+};
+
+template <bool allow_multi_axes>
+class ReduceKernel : public WebGpuKernel, public ReduceKernelBase<allow_multi_axes> {
+ protected:
+  using ReduceKernelBase<allow_multi_axes>::axes_;
+  using ReduceKernelBase<allow_multi_axes>::noop_with_empty_axes_;
+  using ReduceKernelBase<allow_multi_axes>::keepdims_;
+  using ReduceKernelBase<allow_multi_axes>::select_last_index_;
+
+  ReduceKernel(const OpKernelInfo& info, std::string name, optional<int64_t> keepdims_override = {})
+      : WebGpuKernel(info),
+        ReduceKernelBase<allow_multi_axes>(info, keepdims_override),
+        name_(name) {
+  }
+  Status ComputeInternal(ComputeContext& ctx) const;
+  virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const = 0;
+
+ private:
+  std::string name_;
+};
+
+class ReduceMean final : public ReduceKernel<true> {
+ public:
+  ReduceMean(const OpKernelInfo& info) : ReduceKernel(info, "ReduceMean") {}
+  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const override;
+  Status ComputeInternal(ComputeContext& ctx) const override;
+};
+
+}  // namespace webgpu
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index d44cf4674d8a3..4950d94dea4c4 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -516,10 +516,10 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
     // BuildKernelCreateInfo,
     // BuildKernelCreateInfo,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMean)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMean)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMean)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMean)>,
     BuildKernelCreateInfo,
     BuildKernelCreateInfo,

From cfb0a72feb93a59722c55e69b1415d47c80df7bc Mon Sep 17 00:00:00 2001
From: Prathik Rao
Date: Wed, 5 Mar 2025 14:44:32 -0800
Subject: [PATCH 18/46] [WebGPU EP] introduce BiasAdd contrib op (#23861)

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 .../contrib_ops/webgpu/bert/bias_add.cc       | 80 +++++++++++++++++++
 .../contrib_ops/webgpu/bert/bias_add.h        | 32 ++++++++
 .../webgpu/webgpu_contrib_kernels.cc          |  4 +-
 3 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 onnxruntime/contrib_ops/webgpu/bert/bias_add.cc
 create mode 100644 onnxruntime/contrib_ops/webgpu/bert/bias_add.h

diff --git a/onnxruntime/contrib_ops/webgpu/bert/bias_add.cc b/onnxruntime/contrib_ops/webgpu/bert/bias_add.cc
new file mode 100644
index 0000000000000..65c14e8cb0bdd
--- /dev/null
+++ b/onnxruntime/contrib_ops/webgpu/bert/bias_add.cc
@@ -0,0 +1,80 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+ +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "contrib_ops/webgpu/bert/bias_add.h" +#include "contrib_ops/webgpu/webgpu_contrib_kernels.h" + +namespace onnxruntime { +namespace contrib { +namespace webgpu { + +ONNX_OPERATOR_KERNEL_EX( + BiasAdd, + kMSDomain, + 1, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedFloatTypes()), + BiasAdd); + +Status BiasAddProgram::GenerateShaderCode(ShaderHelper& shader) const { + const ShaderVariableHelper& input = shader.AddInput("input"); + const ShaderVariableHelper& bias = shader.AddInput("bias"); + const ShaderVariableHelper& residual = shader.AddInput("residual"); + const ShaderVariableHelper& output = shader.AddOutput("output"); + + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << "let value = " << input.GetByOffset("global_idx") + << " + " << bias.GetByOffset("global_idx % uniforms.channels") + << " + " << residual.GetByOffset("global_idx") << ";\n" + << output.SetByOffset("global_idx", "value"); + + return Status::OK(); +} + +static int64_t GetMaxComponents(int64_t size) { + if (size % 4 == 0) { + return 4; + } else if (size % 2 == 0) { + return 2; + } + return 1; +} + +Status BiasAdd::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { + const auto* input = context.Input(0); + const auto* bias = context.Input(1); + const auto* residual = context.Input(2); + + TensorShape input_shape = input->Shape(); + + if (input_shape.NumDimensions() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "BiasAdd input should have 3 dimensions."); + } + + int64_t channels = input_shape[2]; + int64_t components = GetMaxComponents(channels); + channels /= components; + + TensorShape bias_shape = bias->Shape(); + if (bias_shape.NumDimensions() != 1 || bias_shape[0] != channels) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "BiasAdd bias should have 1 dimension with size equal to the number of channels."); + } + + auto* output = context.Output(0, input_shape); + int64_t output_size = output->Shape().Size() / components; + + BiasAddProgram program{}; + program.AddInputs({{input}, {bias}, {residual}}) + .AddOutput({output}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({{static_cast(output_size)}, + {static_cast(channels)}}); + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/webgpu/bert/bias_add.h b/onnxruntime/contrib_ops/webgpu/bert/bias_add.h new file mode 100644 index 0000000000000..58cc5f09f8003 --- /dev/null +++ b/onnxruntime/contrib_ops/webgpu/bert/bias_add.h @@ -0,0 +1,32 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+
+#pragma once
+
+#include "core/providers/webgpu/program.h"
+#include "core/providers/webgpu/webgpu_kernel.h"
+
+namespace onnxruntime {
+namespace contrib {
+namespace webgpu {
+
+using namespace onnxruntime::webgpu;
+using onnxruntime::webgpu::ComputeContext;
+
+class BiasAddProgram final : public Program<BiasAddProgram> {
+ public:
+  BiasAddProgram() : Program{"BiasAdd"} {}
+  Status GenerateShaderCode(ShaderHelper& sh) const override;
+  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
+                                          {"channels", ProgramUniformVariableDataType::Uint32});
+};
+
+class BiasAdd final : public WebGpuKernel {
+ public:
+  BiasAdd(const OpKernelInfo& info) : WebGpuKernel(info) {}
+  Status ComputeInternal(ComputeContext& context) const override;
+};
+
+}  // namespace webgpu
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc
index 2e7ed5a16a2f0..068a94c7390e2 100644
--- a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc
@@ -37,8 +37,8 @@ Status RegisterWebGpuContribKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
       BuildKernelCreateInfo<void>,  // default entry to avoid the list become empty after ops-reducing
       // BuildKernelCreateInfo,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, BiasAdd)>,
-      // BuildKernelCreateInfo,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, BiasAdd)>,
+      // BuildKernelCreateInfo,
       BuildKernelCreateInfo,
       // BuildKernelCreateInfo,
       // BuildKernelCreateInfo,

From 5e636a676bdbf36907681171ab1d87430cab3b4e Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Wed, 5 Mar 2025 15:13:23 -0800
Subject: [PATCH 19/46] Dynamo export and improve benchmark script for SAM2 encoder (#23887)

### Description
* Add dynamo export for Sam2 image encoder
* Verify fp32 onnx model with CPU EP (to avoid error message from TRT EP).
* Update benchmark script:
  - output ORT profiling
  - output torch compiled code and unique kernel name for compiled kernel
  - add an option for nightly package installation
  - uninstall existing ort packages before installing

The node metadata of the dynamo-exported model can help map nodes in the ONNX
model back to the PyTorch modeling script. Currently, graph optimization is
not done on the dynamo-exported model, so it is experimental right now.

### Motivation and Context
To support profiling of torch compiled CUDA kernel.
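As a quick illustration of that node-to-module mapping (this is not code added by the PR; the model file name is hypothetical and the exact metadata keys depend on the torch/onnx versions used), the per-node metadata recorded by the dynamo exporter can be dumped with a few lines of Python:

```python
# Minimal sketch: print the provenance metadata that the dynamo exporter attaches
# to each ONNX node. Assumes onnx >= 1.16 (which adds NodeProto.metadata_props);
# the model file name below is hypothetical.
import onnx

model = onnx.load("sam2_hiera_large_image_encoder_fp32_gpu.onnx")
for node in model.graph.node[:10]:
    for prop in node.metadata_props:
        # The recorded keys (names vary by torch version) point back to the
        # originating PyTorch module hierarchy / stack trace.
        print(f"{node.op_type} {node.name}: {prop.key} = {prop.value}")
```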
---
 .../tools/transformers/models/sam2/README.md  |  31 +-
 .../models/sam2/benchmark_sam2.py             |  15 +-
 .../models/sam2/benchmark_sam2.sh             | 310 ++++++++++++------
 .../models/sam2/convert_to_onnx.py            |  14 +-
 .../transformers/models/sam2/image_decoder.py |   2 +-
 .../transformers/models/sam2/image_encoder.py |  74 ++++-
 .../transformers/models/sam2/mask_decoder.py  |   2 +-
 .../models/sam2/prompt_encoder.py             |   2 +-
 8 files changed, 315 insertions(+), 135 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/models/sam2/README.md b/onnxruntime/python/tools/transformers/models/sam2/README.md
index e7cafeffc6231..463d154525f8f 100644
--- a/onnxruntime/python/tools/transformers/models/sam2/README.md
+++ b/onnxruntime/python/tools/transformers/models/sam2/README.md
@@ -96,8 +96,7 @@ We can create a conda environment then run GPU benchmark like the following:
 conda create -n sam2_gpu python=3.11 -y
 conda activate sam2_gpu
 install_dir=$HOME
-profiling=true
-bash benchmark_sam2.sh $install_dir gpu $profiling
+bash benchmark_sam2.sh $install_dir gpu
 ```
 
 or create a new conda environment for CPU benchmark:
@@ -107,16 +106,28 @@ conda activate sam2_cpu
 bash benchmark_sam2.sh $HOME cpu
 ```
 
-The first parameter is a directory to clone git repositories or install CUDA/cuDNN for benchmark.
-The second parameter can be either "gpu" or "cpu", which indicates the device to run benchmark.
-The third parameter is optional. Value "true" will enable profiling after running benchmarking on GPU.
+The usage of the script is as follows:
+```
+bash benchmark_sam2.sh <install_dir> <cpu_or_gpu> [profiling] [benchmarking] [nightly] [dynamo]
+```
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| install_dir | $HOME | a directory to clone git repositories or install CUDA/cuDNN for benchmark |
+| cpu_or_gpu | gpu | the device to run benchmark. The value can be either "gpu" or "cpu" |
+| profiling | false | run gpu profiling |
+| benchmarking | true | run benchmark |
+| nightly | false | install onnxruntime nightly or official release package |
+| dynamo | false | whether to export the image encoder using dynamo |
 
-The script will automatically install required packages in current conda environment, download checkpoints, export onnx,
-and run demo, benchmark and optionally run profiling.
+The dynamo export is experimental since graph optimization still needs extra work for this model.
 
-* The performance test result is in sam2_gpu.csv or sam2_cpu.csv, which can be loaded into Excel.
-* The demo output is sam2_demo_fp16_gpu.png or sam2_demo_fp32_cpu.png.
-* The profiling results are in *.nsys-rep or *.json files in current directory. Use Nvidia NSight System to view the *.nsys-rep file.
+Output files:
+* sam2_cpu_[timestamp].csv or sam2_gpu_[timestamp].csv has benchmark results. Use Excel to load the file to view it.
+* onnxruntime_image_[encoder|decoder].json has ONNX Runtime profiling results. Use `chrome://tracing` in Chrome browser to view it.
+* torch_image_[encoder|decoder].json has PyTorch profiling results. Use `chrome://tracing` in Chrome browser to view it.
+* sam2_fp16_profile_image_[encoder|decoder]_[ort|torch]_gpu.[nsys-rep|sqlite] has NVTX profiling. Use Nvidia NSight System to view it.
+* torch_image_encoder_compiled_code.txt has the compiled kernel code from PyTorch.
 
 ## Limitations
 - The exported image_decoder model does not support batch mode for now.
diff --git a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.py b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.py
index 16d71d5057b02..3fc24d157b0cf 100644
--- a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.py
+++ b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.py
@@ -46,6 +46,7 @@ def __init__(
         prefer_nhwc: bool = False,
         warm_up: int = 5,
         enable_nvtx_profile: bool = False,
+        enable_ort_profile: bool = False,
         enable_torch_profile: bool = False,
         repeats: int = 1000,
         verbose: bool = False,
@@ -74,6 +75,7 @@ def __init__(
         self.prefer_nhwc = prefer_nhwc
         self.warm_up = warm_up
         self.enable_nvtx_profile = enable_nvtx_profile
+        self.enable_ort_profile = enable_ort_profile
        self.enable_torch_profile = enable_torch_profile
         self.repeats = repeats
         self.verbose = verbose
@@ -317,6 +319,7 @@ def run_test(
         repeats=args.repeats,
         warm_up=args.warm_up,
         enable_nvtx_profile=args.enable_nvtx_profile,
+        enable_ort_profile=args.enable_ort_profile,
         enable_torch_profile=args.enable_torch_profile,
         torch_compile_mode=args.torch_compile_mode,
         verbose=False,
@@ -325,7 +328,7 @@ def run_test(
     if args.engine == "ort":
         sess_options = SessionOptions()
         sess_options.intra_op_num_threads = args.intra_op_num_threads
-        if config.enable_nvtx_profile:
+        if config.enable_ort_profile:
             sess_options.enable_profiling = True
             sess_options.log_severity_level = 4
             sess_options.log_verbosity_level = 0
@@ -349,6 +352,8 @@ def run_test(
                 with nvtx.annotate("one_run"):
                     _ = session.infer(input_dict)
             cudart.cudaProfilerStop()
+
+        if config.enable_ort_profile:
             session.ort_session.end_profiling()
 
     if repeats == 0:
@@ -554,6 +559,14 @@ def _parse_arguments():
         help="Enable nvtx profiling. It will add an extra run for profiling before performance test.",
     )
 
+    parser.add_argument(
+        "--enable_ort_profile",
+        required=False,
+        default=False,
+        action="store_true",
+        help="Enable ORT profiling.",
+    )
+
     parser.add_argument(
         "--enable_torch_profile",
         required=False,
diff --git a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh
index 9e97867657ab9..c82b1ed31796e 100644
--- a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh
+++ b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh
@@ -5,7 +5,17 @@
 # -------------------------------------------------------------------------
 
 # Please refer to README.md for the prerequisites and usage of this script.
-# bash benchmark_sam2.sh <install_dir> <cpu_or_gpu> [profiling]
+# bash benchmark_sam2.sh <install_dir> <cpu_or_gpu> [profiling] [benchmarking] [nightly] [dynamo]
+# Note that dynamo needs onnxruntime 1.21 or later, or a nightly build.
+# Example:
+#   bash benchmark_sam2.sh $HOME gpu true true true false
+
+install_dir="${1:-$HOME}"
+cpu_or_gpu="${2:-gpu}"
+profiling="${3:-false}"
+benchmarking="${4:-true}"
+nightly="${5:-false}"
+dynamo="${6:-false}"
 
 python="$CONDA_PREFIX/bin/python3"
 
@@ -13,9 +23,6 @@ python="$CONDA_PREFIX/bin/python3"
 dir="$(cd "$(dirname "$0")" && pwd)"
 onnx_dir="$dir/sam2_onnx_models"
 
-# Installation directory (default: $HOME)
-install_dir="${1:-$HOME}"
-
 if [ ! -d "$install_dir" ]; then
   echo "Error: install_dir '$install_dir' does not exist."
   exit 1
@@ -26,7 +33,6 @@ sam2_dir="$install_dir/segment-anything-2"
 model="sam2_hiera_large"
 
 # Default to GPU, switch to CPU if specified
-cpu_or_gpu="${2:-gpu}"
 if [ "$cpu_or_gpu" != "gpu" ] && [ "$cpu_or_gpu" != "cpu" ]; then
   echo "Invalid option: $2. Please specify 'cpu' or 'gpu'."
   exit 1
@@ -35,52 +41,97 @@ fi
 echo "install_dir: $install_dir"
 echo "cpu_or_gpu: $cpu_or_gpu"
 
-install_cuda_12()
-{
-  pushd $install_dir
-  wget https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
-  sh cuda_12.6.2_560.35.03_linux.run --toolkit --toolkitpath=$install_dir/cuda12.6 --silent --override --no-man-page
+# Function to check if a command exists
+command_exists() {
+  command -v "$1" >/dev/null 2>&1
+}
+
+# Ensure necessary tools are installed
+if ! command_exists wget; then
+  echo "wget is not installed. Please install it and try again."
+  exit 1
+fi
+
+if ! command_exists git; then
+  echo "git is not installed. Please install it and try again."
+  exit 1
+fi
+
+if ! command_exists pip; then
+  echo "pip is not installed. Please install it and try again."
+  exit 1
+fi
+
+cuda_version=12.6
+cudnn_version=9.5
 
-  export PATH="$install_dir/cuda12.6/bin:$PATH"
-  export LD_LIBRARY_PATH="$install_dir/cuda12.6/lib64:$LD_LIBRARY_PATH"
-  popd
+# Install CUDA 12.6
+install_cuda_12() {
+  if ! [ -d "$install_dir/cuda${cuda_version}" ]; then
+    pushd "$install_dir" || exit
+    wget https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
+    sh cuda_12.6.2_560.35.03_linux.run --toolkit --toolkitpath="$install_dir/cuda${cuda_version}" --silent --override --no-man-page
+    popd || exit
+  fi
+  export PATH="$install_dir/cuda${cuda_version}/bin:$PATH"
+  export LD_LIBRARY_PATH="$install_dir/cuda${cuda_version}/lib64:$LD_LIBRARY_PATH"
 }
 
-# Function to install cuDNN 9.4
+# Install cuDNN 9.5
 install_cudnn_9() {
-  pushd "$install_dir"
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.5.0.50_cuda12-archive.tar.xz
-  mkdir -p "$install_dir/cudnn9.5"
-  tar -Jxvf cudnn-linux-x86_64-9.5.0.50_cuda12-archive.tar.xz -C "$install_dir/cudnn9.5" --strip=1
-  export LD_LIBRARY_PATH="$install_dir/cudnn9.5/lib:$LD_LIBRARY_PATH"
-  popd
+  if ! [ -d "$install_dir/cudnn${cudnn_version}" ]; then
+    pushd "$install_dir" || exit
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.5.0.50_cuda12-archive.tar.xz
+    mkdir -p "$install_dir/cudnn${cudnn_version}"
+    tar -Jxvf cudnn-linux-x86_64-9.5.0.50_cuda12-archive.tar.xz -C "$install_dir/cudnn${cudnn_version}" --strip=1
+    popd || exit
+  fi
+  export LD_LIBRARY_PATH="$install_dir/cudnn${cudnn_version}/lib:$LD_LIBRARY_PATH"
+}
+
+install_ort() {
+  local ort="$1"
+  pip uninstall onnxruntime onnxruntime-gpu -y
+
+  if [ "$nightly" = "true" ]; then
+    pip install flatbuffers numpy packaging protobuf sympy
+    pip install --pre --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ "$ort"
+  else
+    pip install "$ort"
+  fi
+
+  pip install onnx onnxscript opencv-python matplotlib
 }
 
 # Install GPU dependencies
 install_gpu() {
-  [ ! -d "$install_dir/cuda12.6" ] && install_cuda_12
-  [ ! -d "$install_dir/cudnn9.5" ] && install_cudnn_9
+  install_cuda_12
+  install_cudnn_9
+  echo "PATH: $PATH"
+  echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
+
+  # The dynamo export needs torch 2.6.0 or later. Use the latest one.
+ pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124 --upgrade - pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124 - pip install onnxruntime-gpu onnx opencv-python matplotlib + install_ort "onnxruntime-gpu" } # Install CPU dependencies install_cpu() { pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu - pip install onnxruntime onnx opencv-python matplotlib + install_ort "onnxruntime" } # Clone and install SAM2 if not already installed install_sam2() { - pushd "$install_dir" + pushd "$install_dir" || exit if [ ! -d "$sam2_dir" ]; then git clone https://github.com/facebookresearch/segment-anything-2.git fi - cd "$sam2_dir" + cd "$sam2_dir" || exit pip show SAM-2 > /dev/null 2>&1 || pip install -e . [ ! -f checkpoints/sam2_hiera_large.pt ] && (cd checkpoints && sh ./download_ckpts.sh) - popd + popd || exit } # Download test image if not available @@ -90,7 +141,12 @@ download_test_image() { run_cpu_benchmark() { local repeats="$1" - $python convert_to_onnx.py --sam2_dir "$sam2_dir" --optimize --demo + + if [ "$dynamo" = "true" ]; then + $python convert_to_onnx.py --sam2_dir "$sam2_dir" --optimize --demo --dynamo + else + $python convert_to_onnx.py --sam2_dir "$sam2_dir" --optimize --demo + fi for component in image_encoder image_decoder; do $python benchmark_sam2.py --model_type "$model" --engine torch --sam2_dir "$sam2_dir" --repeats "$repeats" --dtype fp32 --component "$component" @@ -103,65 +159,75 @@ run_cpu_benchmark() { done } -run_gpu_benchmark() { +run_ort_gpu_benchmark() { local repeats="$1" - $python convert_to_onnx.py --sam2_dir "$sam2_dir" --optimize --use_gpu --dtype fp32 - $python convert_to_onnx.py --sam2_dir "$sam2_dir" --optimize --use_gpu --dtype fp16 --demo - for component in image_encoder image_decoder; do - for dtype in bf16 fp32 fp16; do - $python benchmark_sam2.py --model_type "$model" --engine torch --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype $dtype --component "$component" - done - done + if [ "$dynamo" = "true" ]; then + $python convert_to_onnx.py --sam2_dir "$sam2_dir" --optimize --use_gpu --dtype fp32 --dynamo + $python convert_to_onnx.py --sam2_dir "$sam2_dir" --optimize --use_gpu --dtype fp16 --demo --dynamo + else + $python convert_to_onnx.py --sam2_dir "$sam2_dir" --optimize --use_gpu --dtype fp32 + $python convert_to_onnx.py --sam2_dir "$sam2_dir" --optimize --use_gpu --dtype fp16 --demo + fi component="image_encoder" for dtype in fp32 fp16; do - #TODO: --prefer_nhwc does not help with performance - $python benchmark_sam2.py --model_type "$model" --engine ort --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype $dtype --component "$component" --onnx_path "${onnx_dir}/${model}_${component}_${dtype}_gpu.onnx" --use_cuda_graph + $python benchmark_sam2.py --model_type "$model" --engine ort --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype "$dtype" --component "$component" --onnx_path "${onnx_dir}/${model}_${component}_${dtype}_gpu.onnx" --use_cuda_graph done + # Test prefer_nhwc. 
+ $python benchmark_sam2.py --model_type "$model" --engine ort --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype fp16 --component "$component" --onnx_path "${onnx_dir}/${model}_${component}_${dtype}_gpu.onnx" --use_cuda_graph --prefer_nhwc component="image_decoder" for dtype in fp32 fp16; do # TODO: decoder does not work with cuda graph - $python benchmark_sam2.py --model_type "$model" --engine ort --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype $dtype --component "$component" --onnx_path "${onnx_dir}/${model}_${component}_${dtype}_gpu.onnx" + $python benchmark_sam2.py --model_type "$model" --engine ort --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype "$dtype" --component "$component" --onnx_path "${onnx_dir}/${model}_${component}_${dtype}_gpu.onnx" done + # Test prefer_nhwc. + $python benchmark_sam2.py --model_type "$model" --engine ort --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype fp16 --component "$component" --onnx_path "${onnx_dir}/${model}_${component}_${dtype}_gpu.onnx" --prefer_nhwc } -run_torch_compile_gpu_benchmark() { +run_torch_gpu_benchmark() { local repeats="$1" + # Test PyTorch eager mode. + for component in image_encoder image_decoder; do + for dtype in bf16 fp32 fp16; do + $python benchmark_sam2.py --model_type "$model" --engine torch --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype "$dtype" --component "$component" + done + done + # Test different torch compile modes on image encoder for torch_compile_mode in none max-autotune reduce-overhead max-autotune-no-cudagraphs do - $python benchmark_sam2.py --model_type $model --engine torch --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype fp16 --component image_encoder --torch_compile_mode $torch_compile_mode + $python benchmark_sam2.py --model_type "$model" --engine torch --sam2_dir "$sam2_dir" --repeats "$repeats" --use_gpu --dtype fp16 --component image_encoder --torch_compile_mode $torch_compile_mode done } - -# Main script -run_benchmarks() { - if [ ! -v CONDA_PREFIX ]; then - echo "Please activate conda environment before running this script." - exit 1 +install_all() { + if [ "$cpu_or_gpu" = "gpu" ]; then + install_gpu + else + install_cpu fi - - # Install dependencies - [ "$cpu_or_gpu" = "gpu" ] && install_gpu || install_cpu install_sam2 download_test_image +} - # Run benchmarks - output_csv="sam2_${cpu_or_gpu}.csv" +run_benchmarks() { + suffix=$(date +"%Y_%m_%d_%H_%M_%S") + [ "$dynamo" = "true" ] && suffix="${suffix}_dynamo" + output_csv="sam2_${cpu_or_gpu}_${suffix}.csv" if [ ! -f "$output_csv" ]; then echo "Running $cpu_or_gpu benchmark..." if [ "$cpu_or_gpu" = "gpu" ]; then - run_gpu_benchmark 1000 - run_torch_compile_gpu_benchmark 1000 + run_ort_gpu_benchmark 1000 + run_torch_gpu_benchmark 1000 else run_cpu_benchmark 100 fi cat benchmark*.csv > combined_csv awk '!x[$0]++' combined_csv > "$output_csv" + rm benchmark*.csv rm combined_csv echo "Benchmark results saved in $output_csv" else @@ -169,7 +235,16 @@ run_benchmarks() { fi } -run_benchmarks +if [ ! -v CONDA_PREFIX ]; then + echo "Please activate conda environment before running this script." + exit 1 +fi + +install_all + +if [ "$benchmarking" = "true" ]; then + run_benchmarks +fi #-------------------------------------------------------------------------- # Below are for profiling @@ -177,79 +252,100 @@ run_benchmarks # Build onnxruntime-gpu from source for profiling build_onnxruntime_gpu_for_profiling() { - pushd "$install_dir" + pushd "$install_dir" || exit if ! 
[ -d onnxruntime ]; then git clone https://github.com/microsoft/onnxruntime fi - cd onnxruntime - CUDA_ARCH=$(python3 -c "import torch; cc = torch.cuda.get_device_capability(); print(f'{cc[0]}{cc[1]}')") - if [ -n "$CUDA_ARCH" ]; then - pip install --upgrade pip cmake psutil setuptools wheel packaging ninja numpy==1.26.4 - sh build.sh --config Release --build_dir build/cuda12 --build_shared_lib --parallel \ - --use_cuda --cuda_version 12.6 --cuda_home $install_dir/cuda12.6 \ - --cudnn_home $install_dir/cudnn9.5 \ - --build_wheel --skip_tests \ - --cmake_generator Ninja \ - --compile_no_warning_as_error \ - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH \ - --cmake_extra_defines onnxruntime_ENABLE_NVTX_PROFILE=ON \ - --enable_cuda_line_info - - pip install build/cuda12/Release/dist/onnxruntime_gpu-*-linux_x86_64.whl numpy==1.26.4 - else - echo "No CUDA device found." - exit 1 - fi - popd + cd onnxruntime || exit + pip install --upgrade pip cmake psutil setuptools wheel packaging ninja numpy + build_dir=build/cuda${cuda_version} + rm -rf ${build_dir}/Release/dist + sh build.sh --config Release --build_dir "${build_dir}" --build_shared_lib --parallel \ + --use_cuda --cuda_version ${cuda_version} --cuda_home "$install_dir/cuda${cuda_version}" \ + --cudnn_home "$install_dir/cudnn${cudnn_version}" \ + --build_wheel --skip_tests \ + --cmake_generator Ninja \ + --compile_no_warning_as_error \ + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=native \ + --cmake_extra_defines onnxruntime_ENABLE_NVTX_PROFILE=ON \ + --enable_cuda_line_info + pip uninstall onnxruntime-gpu -y + pip install "${build_dir}/Release/dist/onnxruntime_gpu-*-linux_x86_64.whl" + popd || exit } # Run profiling with NVTX. -run_nvtx_profile() -{ - pip install nvtx cuda-python==12.6.0 - +run_nvtx_profile() { + local engine="$1" # Only trace one device to avoid huge output file size. 
   device_id=0
-  envs="CUDA_VISIBLE_DEVICES=$device_id,ORT_ENABLE_CUDNN_FLASH_ATTENTION=1,LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
+  envs="CUDA_VISIBLE_DEVICES=$device_id,ORT_ENABLE_CUDNN_FLASH_ATTENTION=1,LD_LIBRARY_PATH=$LD_LIBRARY_PATH,TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1"
   cuda_graph_trace=node
-  for engine in ort torch; do
-    for component in image_encoder image_decoder; do
-      sudo $install_dir/cuda12.6/bin/nsys profile --capture-range=nvtx --nvtx-capture='one_run' \
-        --gpu-metrics-device $device_id --force-overwrite true \
-        --sample process-tree --backtrace fp --stats true \
-        -t cuda,cudnn,cublas,osrt,nvtx --cuda-memory-usage true --cudabacktrace all \
-        --cuda-graph-trace $cuda_graph_trace \
-        -e $envs,NSYS_NVTX_PROFILER_REGISTER_ONLY=0 \
-        -o sam2_fp16_profile_${component}_${engine}_${cpu_or_gpu} \
-        $python benchmark_sam2.py --model_type $model --engine $engine \
-        --sam2_dir $sam2_dir --warm_up 1 --repeats 0 \
-        --onnx_path ${onnx_dir}/${model}_${component}_fp16_gpu.onnx \
-        --component $component \
-        --use_gpu --dtype fp16 --enable_nvtx_profile
-    done
+  for component in image_encoder image_decoder; do
+    sudo "$install_dir/cuda${cuda_version}/bin/nsys" profile --capture-range=nvtx --nvtx-capture='one_run' \
+      --gpu-metrics-devices $device_id --force-overwrite true \
+      --sample process-tree --backtrace fp --stats true \
+      -t cuda,cudnn,cublas,osrt,nvtx --cuda-memory-usage true --cudabacktrace all \
+      --cuda-graph-trace "$cuda_graph_trace" \
+      -e "$envs,NSYS_NVTX_PROFILER_REGISTER_ONLY=0" \
+      -o "sam2_fp16_profile_${component}_${engine}_${cpu_or_gpu}" \
+      $python benchmark_sam2.py --model_type "$model" --engine "$engine" \
+      --sam2_dir "$sam2_dir" --warm_up 1 --repeats 0 \
+      --onnx_path "${onnx_dir}/${model}_${component}_fp16_gpu.onnx" \
+      --component "$component" \
+      --use_gpu --dtype fp16 --enable_nvtx_profile
   done
 }
 
-# Run profiling with PyTorch
-run_torch_profile() {
+run_ort_profile() {
+  export ORT_ENABLE_CUDNN_FLASH_ATTENTION=1
+  rm -f onnxruntime_*.json
   for component in image_encoder image_decoder; do
-    $python benchmark_sam2.py --model_type $model --engine torch \
-      --sam2_dir $sam2_dir --warm_up 1 --repeats 0 \
-      --component $component \
-      --use_gpu --dtype fp16 --enable_torch_profile
+    $python benchmark_sam2.py --model_type "$model" --engine ort \
+      --sam2_dir "$sam2_dir" --warm_up 1 --repeats 0 \
+      --onnx_path "${onnx_dir}/${model}_${component}_fp16_gpu.onnx" \
+      --component "$component" \
+      --use_gpu --dtype fp16 --enable_ort_profile
+    mv onnxruntime_profile*.json onnxruntime_$component.json
   done
 }
 
+# Run profiling with PyTorch
+run_torch_profile() {
+  # Enabling logging might help capture the code of compiled kernels. You can turn it off to reduce overhead.
+ export TORCH_LOGS="+inductor,+output_code" + export TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 + component=image_encoder + $python benchmark_sam2.py --model_type "$model" --engine torch \ + --sam2_dir "$sam2_dir" --warm_up 1 --repeats 0 \ + --component "$component" \ + --torch_compile_mode max-autotune \ + --use_gpu --dtype fp16 --enable_torch_profile > "torch_${component}_compiled_code.txt" + + component=image_decoder + $python benchmark_sam2.py --model_type "$model" --engine torch \ + --sam2_dir "$sam2_dir" --warm_up 1 --repeats 0 \ + --component "$component" \ + --torch_compile_mode none \ + --use_gpu --dtype fp16 --enable_torch_profile +} +run_nvtx_profilings() { + build_onnxruntime_gpu_for_profiling rm -f *.nsys-rep *.sqlite - run_nvtx_profile + run_nvtx_profile ort + run_nvtx_profile torch +} +run_profilings() { + pip install nvtx cuda-python==${cuda_version}.0 + run_ort_profile run_torch_profile + + # NVTX profiling need to build onnxruntime-gpu from source so it is put as the last step. + run_nvtx_profilings } -profiling="${3:-false}" if [ "$profiling" = "true" ] && [ "$cpu_or_gpu" = "gpu" ]; then run_profilings fi diff --git a/onnxruntime/python/tools/transformers/models/sam2/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/sam2/convert_to_onnx.py index cacad717faf9c..3533a274b9972 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/sam2/convert_to_onnx.py @@ -113,6 +113,14 @@ def parse_arguments(): help="Optimize onnx models for GPU", ) + parser.add_argument( + "--dynamo", + required=False, + default=False, + action="store_true", + help="Use dynamo for exporting onnx model. Only image_encoder supports dynamo right now.", + ) + parser.add_argument( "--verbose", required=False, @@ -151,8 +159,10 @@ def main(): onnx_model_path = sam2_onnx_path(args.output_dir, args.model_type, component, args.multimask_output) if component == "image_encoder": if args.overwrite or not os.path.exists(onnx_model_path): - export_image_encoder_onnx(sam2_model, onnx_model_path, args.dynamic_batch_axes, args.verbose) - test_image_encoder_onnx(sam2_model, onnx_model_path, dynamic_batch_axes=False) + export_image_encoder_onnx( + sam2_model, onnx_model_path, args.dynamic_batch_axes, args.verbose, args.dynamo + ) + test_image_encoder_onnx(sam2_model, onnx_model_path, dynamic_batch_axes=args.dynamic_batch_axes) elif component == "mask_decoder": if args.overwrite or not os.path.exists(onnx_model_path): diff --git a/onnxruntime/python/tools/transformers/models/sam2/image_decoder.py b/onnxruntime/python/tools/transformers/models/sam2/image_decoder.py index 07ed150631f50..376e6ba7d802c 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/image_decoder.py +++ b/onnxruntime/python/tools/transformers/models/sam2/image_decoder.py @@ -246,7 +246,7 @@ def test_decoder_onnx( import onnxruntime - ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=onnxruntime.get_available_providers()) + ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"]) model_inputs = ort_session.get_inputs() input_names = [model_inputs[i].name for i in range(len(model_inputs))] diff --git a/onnxruntime/python/tools/transformers/models/sam2/image_encoder.py b/onnxruntime/python/tools/transformers/models/sam2/image_encoder.py index c5ce339732063..79e9297788c36 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/image_encoder.py +++ 
b/onnxruntime/python/tools/transformers/models/sam2/image_encoder.py @@ -90,6 +90,8 @@ def export_image_encoder_onnx( onnx_model_path: str, dynamic_batch_axes: bool = False, verbose: bool = False, + dynamo: bool = False, + clear_dynamo_metadata: bool = False, ): image = random_sam2_input_image() @@ -113,17 +115,65 @@ def export_image_encoder_onnx( if not verbose: warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) warnings.filterwarnings("ignore", category=UserWarning) - torch.onnx.export( - sam2_encoder, - image, - onnx_model_path, - export_params=True, - opset_version=17, - do_constant_folding=True, - input_names=["image"], - output_names=["image_features_0", "image_features_1", "image_embeddings"], - dynamic_axes=dynamic_axes, - ) + + if not dynamo: + torch.onnx.export( + sam2_encoder, + image, + onnx_model_path, + export_params=True, + opset_version=17, + do_constant_folding=True, + input_names=["image"], + output_names=["image_features_0", "image_features_1", "image_embeddings"], + dynamic_axes=dynamic_axes, + ) + else: + torch._dynamo.config.capture_scalar_outputs = True + ep = torch.export.export( + sam2_encoder, + args=(image,), + strict=False, + dynamic_shapes=[ + {0: torch.export.Dim.AUTO}, + ], + ) + + onnx_program = torch.onnx.export( + ep, + (), + opset_version=17, + input_names=["image"], + output_names=["image_features_0", "image_features_1", "image_embeddings"], + dynamo=True, + ) + onnx_program.optimize() + onnx_program.save(onnx_model_path + ".dynamo.onnx", external_data=False) + import onnx + + from onnxruntime.transformers.dynamo_onnx_helper import DynamoOnnxHelper + + onnx_model = onnx.load_model(onnx_model_path + ".dynamo.onnx", load_external_data=True) + if dynamic_batch_axes: + # Fix labels of dynamic axes since they can't be specified during Dynamo export currently + onnx_model.graph.input[0].type.tensor_type.shape.dim[0].dim_param = "batch_size" + for i in range(3): + onnx_model.graph.output[i].type.tensor_type.shape.dim[0].dim_param = "batch_size" + + onnx_model_helper = DynamoOnnxHelper(onnx_model) + onnx_model_helper.convert_constants_to_initializers() + if clear_dynamo_metadata: + onnx_model_helper.clear_metadata() + + import os + + if os.path.exists(onnx_model_path): + os.remove(onnx_model_path) + if os.path.exists(onnx_model_path + ".data"): + os.remove(onnx_model_path + ".data") + onnx_model_helper.model.save_model_to_file( + onnx_model_path, use_external_data_format=True, all_tensors_to_one_file=True, convert_attribute=True + ) print("encoder onnx model saved to", onnx_model_path) @@ -133,7 +183,7 @@ def test_image_encoder_onnx( onnx_model_path: str, dynamic_batch_axes=False, ): - ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=onnxruntime.get_available_providers()) + ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"]) model_inputs = ort_session.get_inputs() input_names = [model_inputs[i].name for i in range(len(model_inputs))] diff --git a/onnxruntime/python/tools/transformers/models/sam2/mask_decoder.py b/onnxruntime/python/tools/transformers/models/sam2/mask_decoder.py index 56473c002d4ae..fa83e2f666d06 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/mask_decoder.py +++ b/onnxruntime/python/tools/transformers/models/sam2/mask_decoder.py @@ -177,7 +177,7 @@ def test_mask_decoder_onnx( import onnxruntime - ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=onnxruntime.get_available_providers()) + ort_session = 
onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])
 
     model_inputs = ort_session.get_inputs()
     input_names = [model_inputs[i].name for i in range(len(model_inputs))]
diff --git a/onnxruntime/python/tools/transformers/models/sam2/prompt_encoder.py b/onnxruntime/python/tools/transformers/models/sam2/prompt_encoder.py
index 883c51858346c..f25e6ff23324b 100644
--- a/onnxruntime/python/tools/transformers/models/sam2/prompt_encoder.py
+++ b/onnxruntime/python/tools/transformers/models/sam2/prompt_encoder.py
@@ -146,7 +146,7 @@ def test_prompt_encoder_onnx(
 
     import onnxruntime
 
-    ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=onnxruntime.get_available_providers())
+    ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])
 
     model_inputs = ort_session.get_inputs()
     input_names = [model_inputs[i].name for i in range(len(model_inputs))]

From aafa8d170a134e0b807ce852ea6294444ba4fb06 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Wed, 5 Mar 2025 16:03:21 -0800
Subject: [PATCH 20/46] [js/web] improve workaround for bundlers (#23902)

### Description
This PR improves the workaround for bundlers in onnxruntime-web. Specifically, the following changes have been made:
- Use [this workaround](https://github.com/xenova/onnxruntime/commit/9c50aa2c63bad4cb73ad77ff1c43e0c43da0907f) as suggested by @xenova in https://github.com/huggingface/transformers.js/pull/1161#issuecomment-2695785730
- Use `url > "file:" && url < "file;"` instead of `url.startsWith("file:")` to allow minifiers to remove dead code correctly. This change makes it possible to remove the unnecessary dependency on the file parsed from `new URL("ort.bundle.min.js", import.meta.url)` in Vite, and to optimize code like `if("file://filepath.js".startsWith("file:")) {do_sth1(); } else {do_sth2();}` into `do_sth1()` for webpack/terser users.
Resolves https://github.com/huggingface/transformers.js/pull/1161 --- js/web/lib/wasm/proxy-wrapper.ts | 8 +++-- js/web/lib/wasm/wasm-utils-import.ts | 50 ++++++++++++++++++++++++++-- js/web/script/build.ts | 19 ++++++++--- js/web/test/e2e/exports/main.js | 11 +++++- js/web/test/e2e/exports/test.js | 22 ++++++++++++ 5 files changed, 100 insertions(+), 10 deletions(-) diff --git a/js/web/lib/wasm/proxy-wrapper.ts b/js/web/lib/wasm/proxy-wrapper.ts index 5d97bb83e3475..30b1f5101e5f2 100644 --- a/js/web/lib/wasm/proxy-wrapper.ts +++ b/js/web/lib/wasm/proxy-wrapper.ts @@ -12,7 +12,11 @@ import { } from './proxy-messages'; import * as core from './wasm-core-impl'; import { initializeWebAssembly } from './wasm-factory'; -import { importProxyWorker, inferWasmPathPrefixFromScriptSrc } from './wasm-utils-import'; +import { + importProxyWorker, + inferWasmPathPrefixFromScriptSrc, + isEsmImportMetaUrlHardcodedAsFileUri, +} from './wasm-utils-import'; const isProxy = (): boolean => !!env.wasm.proxy && typeof document !== 'undefined'; let proxyWorker: Worker | undefined; @@ -116,7 +120,7 @@ export const initializeWebAssemblyAndOrtRuntime = async (): Promise => { BUILD_DEFS.IS_ESM && BUILD_DEFS.ENABLE_BUNDLE_WASM_JS && !message.in!.wasm.wasmPaths && - (objectUrl || BUILD_DEFS.ESM_IMPORT_META_URL?.startsWith('file:')) + (objectUrl || isEsmImportMetaUrlHardcodedAsFileUri) ) { // for a build bundled the wasm JS, if either of the following conditions is met: // - the proxy worker is loaded from a blob URL diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts index 871b575d71edc..a8e27f6f334bc 100644 --- a/js/web/lib/wasm/wasm-utils-import.ts +++ b/js/web/lib/wasm/wasm-utils-import.ts @@ -11,6 +11,39 @@ import { isNode } from './wasm-utils-env'; */ const origin = isNode || typeof location === 'undefined' ? undefined : location.origin; +/** + * Some bundlers (eg. Webpack) will rewrite `import.meta.url` to a file URL at compile time. + * + * This function checks if `import.meta.url` starts with `file:`, but using the `>` and `<` operators instead of + * `startsWith` function so that code minimizers can remove the dead code correctly. + * + * For example, if we use terser to minify the following code: + * ```js + * if ("file://hard-coded-filename".startsWith("file:")) { + * console.log(1) + * } else { + * console.log(2) + * } + * + * if ("file://hard-coded-filename" > "file:" && "file://hard-coded-filename" < "file;") { + * console.log(3) + * } else { + * console.log(4) + * } + * ``` + * + * The minified code will be: + * ```js + * "file://hard-coded-filename".startsWith("file:")?console.log(1):console.log(2),console.log(3); + * ``` + * + * (use Terser 5.39.0 with default options, https://try.terser.org/) + * + * @returns true if the import.meta.url is hardcoded as a file URI. + */ +export const isEsmImportMetaUrlHardcodedAsFileUri = + BUILD_DEFS.IS_ESM && BUILD_DEFS.ESM_IMPORT_META_URL! > 'file:' && BUILD_DEFS.ESM_IMPORT_META_URL! < 'file;'; + const getScriptSrc = (): string | undefined => { // if Nodejs, return undefined if (isNode) { @@ -26,9 +59,22 @@ const getScriptSrc = (): string | undefined => { // new URL('actual-bundle-name.js', import.meta.url).href // ``` // So that bundler can preprocess the URL correctly. - if (BUILD_DEFS.ESM_IMPORT_META_URL?.startsWith('file:')) { + if (isEsmImportMetaUrlHardcodedAsFileUri) { // if the rewritten URL is a relative path, we need to use the origin to resolve the URL. 
- return new URL(new URL(BUILD_DEFS.BUNDLE_FILENAME, BUILD_DEFS.ESM_IMPORT_META_URL).href, origin).href; + + // The following is a workaround for Vite. + // + // Vite uses a bundler(rollup/rolldown) that does not rewrite `import.meta.url` to a file URL. So in theory, this + // code path should not be executed in Vite. However, the bundler does not know it and it still try to load the + // following pattern: + // - `return new URL('filename', import.meta.url).href` + // + // By replacing the pattern above with the following code, we can skip the resource loading behavior: + // - `const URL2 = URL; return new URL2('filename', import.meta.url).href;` + // + // And it still works in Webpack. + const URL2 = URL; + return new URL(new URL2(BUILD_DEFS.BUNDLE_FILENAME, BUILD_DEFS.ESM_IMPORT_META_URL).href, origin).href; } return BUILD_DEFS.ESM_IMPORT_META_URL; diff --git a/js/web/script/build.ts b/js/web/script/build.ts index 6006de62b41b6..7966262631bbf 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -123,13 +123,17 @@ async function minifyWasmModuleJsForBrowser(filepath: string): Promise { // ``` // with: // ``` - // new Worker(import.meta.url.startsWith('file:') - // ? new URL(BUILD_DEFS.BUNDLE_FILENAME, import.meta.url) - // : new URL(import.meta.url), ... + // new Worker((() => { + // const URL2 = URL; + // return import.meta.url > 'file:' && import.meta.url < 'file;' + // ? new URL2(BUILD_DEFS.BUNDLE_FILENAME, import.meta.url) + // : new URL(import.meta.url); + // })(), ... // ``` // // NOTE: this is a workaround for some bundlers that does not support runtime import.meta.url. - // TODO: in emscripten 3.1.61+, need to update this code. + // + // Check more details in the comment of `isEsmImportMetaUrlHardcodedAsFileUri()` and `getScriptSrc()` in file `lib/wasm/wasm-utils-import.ts`. // First, check if there is exactly one occurrence of "new Worker(new URL(import.meta.url)". const matches = [...contents.matchAll(/new Worker\(new URL\(import\.meta\.url\),/g)]; @@ -142,7 +146,12 @@ async function minifyWasmModuleJsForBrowser(filepath: string): Promise { // Replace the only occurrence. contents = contents.replace( /new Worker\(new URL\(import\.meta\.url\),/, - `new Worker(import.meta.url.startsWith('file:')?new URL(BUILD_DEFS.BUNDLE_FILENAME, import.meta.url):new URL(import.meta.url),`, + `new Worker((() => { + const URL2 = URL; + return (import.meta.url > 'file:' && import.meta.url < 'file;') + ? 
new URL2(BUILD_DEFS.BUNDLE_FILENAME, import.meta.url) + : new URL(import.meta.url); + })(),`, ); // Use terser to minify the code with special configurations: diff --git a/js/web/test/e2e/exports/main.js b/js/web/test/e2e/exports/main.js index 8ed22a6784e7c..d8c7bbf69039f 100644 --- a/js/web/test/e2e/exports/main.js +++ b/js/web/test/e2e/exports/main.js @@ -3,7 +3,7 @@ 'use strict'; -const { runDevTest, runProdTest } = require('./test'); +const { runDevTest, runProdTest, verifyAssets } = require('./test'); const { installOrtPackages } = require('./utils'); /** @@ -29,5 +29,14 @@ module.exports = async function main(PRESERVE, PACKAGES_TO_INSTALL) { await runDevTest('vite-default', '\x1b[32m➜\x1b[39m \x1b[1mLocal\x1b[22m:', 5173); await runProdTest('vite-default', '\x1b[32m➜\x1b[39m \x1b[1mLocal\x1b[22m:', 4173); + + await verifyAssets('vite-default', async (cwd) => { + const globby = await import('globby'); + + return { + test: 'File "dist/assets/**/ort.*.mjs" should not exist', + success: globby.globbySync('dist/assets/**/ort.*.mjs', { cwd }).length === 0, + }; + }); } }; diff --git a/js/web/test/e2e/exports/test.js b/js/web/test/e2e/exports/test.js index 9c5ed745ab0b5..e2bcffea97519 100644 --- a/js/web/test/e2e/exports/test.js +++ b/js/web/test/e2e/exports/test.js @@ -121,7 +121,29 @@ async function runProdTest(testCaseName, ready, port) { await runTest(testCaseName, ['prod'], ready, 'npm run start', port); } +async function verifyAssets(testCaseName, testers) { + testers = Array.isArray(testers) ? testers : [testers]; + const wd = path.join(__dirname, 'testcases', testCaseName); + + console.log(`[${testCaseName}] Verifying assets...`); + + const testResults = []; + + try { + for (const tester of testers) { + testResults.push(await tester(wd)); + } + + if (testResults.some((r) => !r.success)) { + throw new Error(`[${testCaseName}] asset verification failed.`); + } + } finally { + console.log(`[${testCaseName}] asset verification result:`, testResults); + } +} + module.exports = { runDevTest, runProdTest, + verifyAssets, }; From d35db9b8fdae986a5ac8e894607350dee551568f Mon Sep 17 00:00:00 2001 From: Jianhui Dai Date: Thu, 6 Mar 2025 09:33:03 +0800 Subject: [PATCH 21/46] [webgpu] Restore MatMulNBits workgroup size for Phi-3.5 (#23349) ### Description This change restores the MatMulNBits workgroup size from (8, 8, 1) back to (16, 8, 1) to resolve a performance regression observed on Intel iGPUs during token generation (M=1). ### Motivation and Context As above. Signed-off-by: Jianhui Dai --- onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index 28d622b2c9c33..1534fd26d3ad9 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -884,7 +884,8 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context program.CacheHint("T_M" + std::to_string(tile_m) + "Subgroup" + std::to_string(use_subgroup)); } else if (block_size == 32) { components = 1; - constexpr uint32_t workgroup_size = 64; + // TODO: Tune the workgroup size when `M=1`. + constexpr uint32_t workgroup_size = 128; const uint32_t workgroup_y = N % 8 == 0 ? 
8 : 1; const uint32_t workgroup_x = workgroup_size / workgroup_y; program.SetWorkgroupSize(workgroup_x, workgroup_y, 1); From 95225dda1ea382923577b00bfdc6a4df10f32ec7 Mon Sep 17 00:00:00 2001 From: xhcao Date: Thu, 6 Mar 2025 09:43:32 +0800 Subject: [PATCH 22/46] [webgpu] support Pad operator (#23141) ### Description ### Motivation and Context --- .../core/providers/webgpu/tensor/pad.cc | 261 ++++++++++++++++++ .../core/providers/webgpu/tensor/pad.h | 40 +++ .../webgpu/webgpu_execution_provider.cc | 16 +- 3 files changed, 311 insertions(+), 6 deletions(-) create mode 100644 onnxruntime/core/providers/webgpu/tensor/pad.cc create mode 100644 onnxruntime/core/providers/webgpu/tensor/pad.h diff --git a/onnxruntime/core/providers/webgpu/tensor/pad.cc b/onnxruntime/core/providers/webgpu/tensor/pad.cc new file mode 100644 index 0000000000000..9ee13aada67fe --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/pad.cc @@ -0,0 +1,261 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include + +#include "core/util/math.h" +#include "core/providers/webgpu/tensor/pad.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +namespace onnxruntime { +namespace webgpu { + +Status PadProgram::GenerateShaderCode(ShaderHelper& shader) const { + if (!dim_value_zero_) { + shader.AddInput("data", ShaderUsage::UseUniform | ShaderUsage::UseShapeAndStride); + } + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseShapeAndStride | ShaderUsage::UseValueTypeAlias); + + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"); + std::string constant_value_str = std::string("let constant_value = ") + + (is_float16_ ? "bitcast>(uniforms.constant_value)[0];\n" : "bitcast(uniforms.constant_value);\n"); + if (dim_value_zero_) { + // Only Constant mode needs fill output if the one dim value or mores dims' values of input are zero. + shader.MainFunctionBody() << constant_value_str + << "output[global_idx] = constant_value;\n"; + return Status::OK(); + } + + shader.MainFunctionBody() << " let output_indices = " << output.OffsetToIndices("global_idx") << ";\n" + << " var input_index = u32(0);\n" + << " var use_pad_value = false;\n" + << " var in_coord = i32(0);\n"; + + const int rank = output.Rank(); + std::string output_indices_str = "i32(" + GetElementAt("output_indices", "dim", rank) + ")"; + std::string lower_pads_str = GetElementAt("uniforms.lower_pads", "dim", rank); + std::string data_shape_str = "i32(" + GetElementAt("uniforms.data_shape", "dim", rank) + ")"; + std::string data_stride_str = rank == 1 ? 
"" : " * " + GetElementAt("uniforms.data_stride", "dim", rank - 1); + std::string begin_axis_statement = "in_coord = "; + std::string end_axis_statement = "in_coord = "; + std::string in_axis_statement = "in_coord = " + output_indices_str + " - " + lower_pads_str + ";\n"; + switch (mode_) { + case Mode::Constant: + begin_axis_statement = "use_pad_value = true;\n"; + end_axis_statement = "use_pad_value = true;\n"; + break; + case Mode::Edge: + begin_axis_statement += "0;\n"; + end_axis_statement += data_shape_str + " - 1;\n"; + break; + case Mode::Reflect: + begin_axis_statement += lower_pads_str + " - " + output_indices_str + ";\n"; + end_axis_statement += data_shape_str + " - 2 - (" + output_indices_str + + " - (" + lower_pads_str + " + " + data_shape_str + "));\n"; + break; + case Mode::Wrap: + begin_axis_statement += data_shape_str + " + " + output_indices_str + " - " + lower_pads_str + ";\n"; + end_axis_statement += output_indices_str + " - " + lower_pads_str + " - " + data_shape_str + ";\n"; + break; + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported mode type: ", static_cast(mode_)); + } + + shader.MainFunctionBody() << " for (var dim = 0; dim < " << rank << " && !use_pad_value; dim++) {\n" + << " if (" << output_indices_str << " < " << lower_pads_str << ") {\n" + << " " << begin_axis_statement << " }\n" + << " else if (" << output_indices_str << " >= " << lower_pads_str << " + " << data_shape_str << ") {\n" + << " " << end_axis_statement << " }\n" + << " else {\n" + << " " << in_axis_statement << " }\n" + << " input_index += select(u32(in_coord)" << data_stride_str << ", u32(in_coord), dim == " << rank - 1 << ");\n" + << " }\n" + << " " << constant_value_str + << " " << output.SetByOffset("global_idx", "select(data[input_index], constant_value, use_pad_value)"); + + return Status::OK(); +} + +Status Pad::ComputeInternal(ComputeContext& context) const { + const Tensor* input_tensor = context.Input(0); + auto const& input_shape = input_tensor->Shape(); + size_t dimension_count = input_shape.NumDimensions(); + + const PadsVector* p_pads = &pads_; + const PadsVector* p_slices = &slices_; + + PadsVector pads; + PadsVector slices; + // kOnnxDomain Pad opset >= 11 (Or) kMsDomain opset == 1 + if (is_dynamic_) { + size_t data_rank = input_tensor->Shape().NumDimensions(); + + const Tensor* pads_tensor = context.Input(1); + auto pads_tensor_dims = pads_tensor->Shape().GetDims(); + ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1), + "Pads tensor should be a 1D tensor of shape [2 * num_axes] " + "or a 2D tensor of shape [1, 2 * num_axes]"); + + const auto pads_data = pads_tensor->DataAsSpan(); + + // Compute Pads by applying axes if specified otherwise copy the supplied pads. 
+ PadBase::ComputePads(context.KernelContext(), data_rank, pads_data, pads); + + // Separate out any negative pads into the slices array + PadBase::SeparateNegativeToSlices(pads, slices); + + p_pads = &pads; + p_slices = &slices; + } + + auto output_dims(input_shape.AsShapeVector()); + ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values"); + + // Calculate output dimensions, and handle any negative padding + std::vector lower_pads(dimension_count); + for (size_t i = 0; i < dimension_count; i++) { + int64_t lower_pad = (*p_pads)[i] + (*p_slices)[i]; + int64_t upper_pad = (*p_pads)[i + dimension_count] + (*p_slices)[i + dimension_count]; + lower_pads[i] = static_cast(lower_pad); + output_dims[i] += lower_pad + upper_pad; + } + TensorShape output_shape(output_dims); + + // special case when there is a dim value of 0 in the shape. behavior depends on mode + bool dim_value_zero = input_shape.Size() == 0; + if (dim_value_zero) { + ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape)); + } + + auto* output_tensor = context.Output(0, output_shape); + uint32_t output_size = gsl::narrow(output_shape.Size()); + if (output_size == 0) { + // Do not need to fill output, return + return Status::OK(); + } + + // Read constant value and bitcast to uint32. + uint32_t value_uint32 = 0; + const auto data_type = input_tensor->GetElementType(); + bool is_float16 = data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16; + const Tensor* value_tensor = context.Input(2); + if (!is_dynamic_) { + if (is_float16) { + uint16_t value = math::floatToHalf(value_); + std::memcpy(&value_uint32, &value, sizeof(value)); + } else { + value_uint32 = *reinterpret_cast(&value_); + } + } else if (value_tensor) { + ORT_ENFORCE(value_tensor->DataType() == input_tensor->DataType() && value_tensor->Shape().Size() == 1, + "Value tensor should be a 1D tensor of size 1 with the same type as that of the input tensor"); + switch (data_type) { + case ONNX_NAMESPACE::TensorProto_DataType_INT32: { + int32_t value = value_tensor->Data()[0]; + value_uint32 = *reinterpret_cast(&value); + } break; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { + float value = value_tensor->Data()[0]; + value_uint32 = *reinterpret_cast(&value); + } break; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: { + uint16_t value = value_tensor->Data()[0].val; + std::memcpy(&value_uint32, &value, sizeof(value)); + } break; + case ONNX_NAMESPACE::TensorProto_DataType_UINT32: { + value_uint32 = value_tensor->Data()[0]; + } break; + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported input type: ", static_cast(data_type)); + } + } + + PadProgram program{mode_, dim_value_zero, is_float16}; + if (!dim_value_zero) { + program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank}); + } + program.AddOutput({output_tensor, ProgramTensorMetadataDependency::Rank}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .CacheHint(std::to_string(static_cast(mode_)), dim_value_zero) + .AddUniformVariables({{gsl::span(lower_pads.data(), lower_pads.size())}, {output_size}, {value_uint32}}); + + return context.RunProgram(program); +} + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 2, 10, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Pad); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 11, 12, + kWebGpuExecutionProvider, + 
(*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .InputMemoryType(OrtMemTypeCPUInput, 2) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Pad); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 13, 17, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .InputMemoryType(OrtMemTypeCPUInput, 2) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Pad); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 18, 18, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .InputMemoryType(OrtMemTypeCPUInput, 2) + .InputMemoryType(OrtMemTypeCPUInput, 3) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Pad); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 19, 20, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .InputMemoryType(OrtMemTypeCPUInput, 2) + .InputMemoryType(OrtMemTypeCPUInput, 3) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Pad); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 21, 22, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .InputMemoryType(OrtMemTypeCPUInput, 2) + .InputMemoryType(OrtMemTypeCPUInput, 3) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Pad); +ONNX_OPERATOR_KERNEL_EX( + Pad, + kOnnxDomain, + 23, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .InputMemoryType(OrtMemTypeCPUInput, 2) + .InputMemoryType(OrtMemTypeCPUInput, 3) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Pad); + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/pad.h b/onnxruntime/core/providers/webgpu/tensor/pad.h new file mode 100644 index 0000000000000..58049ddb0e5ce --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/pad.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/cpu/tensor/padbase.h" + +namespace onnxruntime { +namespace webgpu { + +class PadProgram final : public Program { + public: + PadProgram(const Mode mode, bool dim_value_zero, bool is_float16) : Program{"Pad"}, + mode_{mode}, + dim_value_zero_{dim_value_zero}, + is_float16_{is_float16} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"lower_pads", ProgramUniformVariableDataType::Int32}, + {"output_size", ProgramUniformVariableDataType::Uint32}, + {"constant_value", ProgramUniformVariableDataType::Uint32}); + + private: + Mode mode_; + bool dim_value_zero_; + bool is_float16_; +}; + +class Pad final : public PadBase, public WebGpuKernel { + public: + Pad(const OpKernelInfo& info) : PadBase(info), WebGpuKernel(info) {} + + Status ComputeInternal(ComputeContext& context) const override; +}; + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 4950d94dea4c4..a2b8709e0e075 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -363,7 +363,9 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Pad); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, Pad); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, 18, Pad); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 19, Pad); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 19, 20, Pad); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 21, 22, Pad); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 23, Pad); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, If); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, If); @@ -685,11 +687,13 @@ std::unique_ptr RegisterKernels() { // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, From b5242293475c944c2abae3fd5d96e1c1788054a7 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Thu, 6 Mar 2025 14:33:07 +0800 Subject: [PATCH 23/46] [WebNN] Accept Float16Array for float16 data type if it is available (#23894) Float16Array is now shipping and WebNN Chromium implementation has accepted it. We should allow it in WebNN EP as well. 
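The JS-side counterpart of this fallback can be sketched as follows. This is a minimal sketch, not the patch's code; `Float16Array` is looked up dynamically because it is not yet in every TS lib definition, and the cast is an assumption made only to keep the sketch compiling:

```
// Feature-detect Float16Array the same way the EP does: the constructor must
// exist and expose `from`. (The Uint16ArrayConstructor cast is an assumption
// for TS lib definitions that predate Float16Array.)
const Float16ArrayCtor = (globalThis as { Float16Array?: Uint16ArrayConstructor }).Float16Array;

function createFloat16View(buffer: ArrayBuffer) {
  // Same raw bits either way; only the element interpretation differs.
  return Float16ArrayCtor && Float16ArrayCtor.from ? new Float16ArrayCtor(buffer) : new Uint16Array(buffer);
}
```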
--- js/web/lib/wasm/jsep/backend-webnn.ts | 3 ++- .../builders/impl/rotaryEmbedding_op_builder.cc | 14 +++++++++++--- .../core/providers/webnn/builders/model_builder.cc | 6 ++++-- .../core/providers/webnn/builders/model_builder.h | 10 ++++++++-- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/js/web/lib/wasm/jsep/backend-webnn.ts b/js/web/lib/wasm/jsep/backend-webnn.ts index 2b9a9208e2e53..55784ae13ad7a 100644 --- a/js/web/lib/wasm/jsep/backend-webnn.ts +++ b/js/web/lib/wasm/jsep/backend-webnn.ts @@ -314,7 +314,8 @@ export class WebNNBackend { bufferView = new Float32Array(buffer); break; case 'float16': - bufferView = new Uint16Array(buffer); + bufferView = + typeof Float16Array !== 'undefined' && Float16Array.from ? new Float16Array(buffer) : new Uint16Array(buffer); break; case 'int32': bufferView = new Int32Array(buffer); diff --git a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc index cbaff79f4fd4f..966deb14196dd 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc @@ -219,9 +219,17 @@ Status RotaryEmbeddingOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_build sign_buffer.set(0, -1.0f); sign_buffer.set(1, 1.0f); } else if (input_data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) { - sign_buffer = emscripten::val::global("Uint16Array").new_(2); - sign_buffer.set(0, PackFloat32ToUint16AsFloat16(-1.0f)); - sign_buffer.set(1, PackFloat32ToUint16AsFloat16(1.0f)); + if (model_builder.IsFloat16ArrayAvailable()) { + // Float16Array is avaliable - use Float16Array. + sign_buffer = emscripten::val::global("Float16Array").new_(2); + sign_buffer.set(0, -1.0f); + sign_buffer.set(1, 1.0f); + } else { + // Float16Array is not available - use Uint16Array instead. + sign_buffer = emscripten::val::global("Uint16Array").new_(2); + sign_buffer.set(0, PackFloat32ToUint16AsFloat16(-1.0f)); + sign_buffer.set(1, PackFloat32ToUint16AsFloat16(1.0f)); + } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported input data type: ", input_data_type); } diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc index ace6519a1fc11..cf4ce216ed5b3 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc @@ -197,7 +197,8 @@ Status ModelBuilder::RegisterInitializers() { // Wasm memory grow will cause all array buffers reallocation, which will be treated as detached // buffers in JS side. Simply create a copy to fix it. - operand = wnn_builder_.call("constant", desc, view.call("slice")); + view = view.call("slice"); + operand = wnn_builder_.call("constant", desc, view["buffer"]); } } else { // TODO: support other type. @@ -350,7 +351,8 @@ Status ModelBuilder::AddOperandFromPersistMemoryBuffer( emscripten::val operand = emscripten::val::object(); // Wasm memory grow will cause all array buffers reallocation, which will be treated as detached // buffers in JS side. Simply create a copy to fix it. 
-    operand = wnn_builder_.call("constant", desc, view.call("slice"));
+    view = view.call("slice");
+    operand = wnn_builder_.call("constant", desc, view["buffer"]);

   AddOperand(name, operand);
   mem_persist_buffers_.push_back(std::move(persist_buffer));
diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.h b/onnxruntime/core/providers/webnn/builders/model_builder.h
index 4e2d84f481df0..1e5f859506d6b 100644
--- a/onnxruntime/core/providers/webnn/builders/model_builder.h
+++ b/onnxruntime/core/providers/webnn/builders/model_builder.h
@@ -30,6 +30,7 @@ class ModelBuilder {
   Status Compile(std::unique_ptr& model) ORT_MUST_USE_RESULT;

   // Accessors for members.
+  bool IsFloat16ArrayAvailable() const { return is_float16array_available_; }
   const GraphViewer& GetGraphViewer() const { return graph_viewer_; }
   InitializedTensorSet GetInitializerTensors();

@@ -68,6 +69,8 @@ class ModelBuilder {
  private:
   const GraphViewer& graph_viewer_;
   const logging::Logger& logger_;
+  const bool is_float16array_available_ = !emscripten::val::global("Float16Array").isUndefined() &&
+                                          emscripten::val::global("Float16Array").hasOwnProperty("from");

   emscripten::val wnn_context_ = emscripten::val::undefined();
   emscripten::val wnn_builder_ = emscripten::val::undefined();
@@ -172,9 +175,12 @@ const emscripten::val& ModelBuilder::CreateOrGetConstant(const int32_t& data_typ
       }
       break;
     case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:
-      buffer = emscripten::val::global("Uint16Array").new_(num_elements);
+      buffer = is_float16array_available_
+                   ? emscripten::val::global("Float16Array").new_(num_elements)
+                   : emscripten::val::global("Uint16Array").new_(num_elements);
       if (value) {
-        buffer.call("fill", emscripten::val(PackFloat32ToUint16AsFloat16(value)));
+        buffer.call("fill",
+                    emscripten::val(is_float16array_available_ ? value : PackFloat32ToUint16AsFloat16(value)));
       }
       break;
     case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:

From 996fffbeddf8a6402a15f1db9155d083ecdd1ba1 Mon Sep 17 00:00:00 2001
From: Mark Schofield
Date: Thu, 6 Mar 2025 08:51:50 -0800
Subject: [PATCH 24/46] Ensure that the 'cmake_minimum_required' is version 3.5 or greater (#23888)

### Description
CMake 4.0 release candidate 2.0 is available, and it cannot compile all of OnnxRuntime out-of-the-box. There are portions of the OnnxRuntime codebase that specify a `cmake_minimum_required` version of 3.0, and CMake 4.0 has removed support for compatibility with CMake < 3.5 - the following error is reported:

```
CMake Error at winml_sdk_helpers.cmake:4 (cmake_minimum_required):
  Compatibility with CMake < 3.5 has been removed from CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.

  Or, add -DCMAKE_POLICY_VERSION_MINIMUM=3.5 to try configuring anyway.
```

Since CMake 3.5 appears to have shipped in 2016, it seems reasonable to set that as a minimum version to fix the error. The root CMakeLists.txt does ask for a minimum version of 3.28, so we could snap to that, but I'm still ramping up on the build, so wanted to propose a minimally sufficient fix.

### Motivation and Context
Being able to build with the latest CMake - when it ships - reduces the barrier to entry to building OnnxRuntime, and allows OnnxRuntime to leverage the latest and greatest tooling.
---
 cmake/nuget_helpers.cmake     | 2 +-
 cmake/winml_sdk_helpers.cmake | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/nuget_helpers.cmake b/cmake/nuget_helpers.cmake
index 22143ac422e9f..b066d1e9fb50e 100644
--- a/cmake/nuget_helpers.cmake
+++ b/cmake/nuget_helpers.cmake
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.

-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.5)

 # Determines the version of a native nuget package from the root packages.config.
 #
diff --git a/cmake/winml_sdk_helpers.cmake b/cmake/winml_sdk_helpers.cmake
index 9241fcd060caf..ca657311b7f14 100644
--- a/cmake/winml_sdk_helpers.cmake
+++ b/cmake/winml_sdk_helpers.cmake
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.

-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.5)

 # utility
 function(convert_forward_slashes_to_back input output)

From 54b2d64c525ccac20f6ebd9ca20dfea0c8b75565 Mon Sep 17 00:00:00 2001
From: jiangzhaoming
Date: Fri, 7 Mar 2025 00:53:23 +0800
Subject: [PATCH 25/46] WebGPU: Remove deprecated subgroups-f16 from WebGPU native and JS EP (#23898)

This PR removes the deprecated subgroups-f16 from the WebGPU native and JS EPs, and also removes the unused deviceInfo from the WebGPU JS EP.

---
 js/web/lib/wasm/jsep/backend-webgpu.ts        | 28 +------------------
 js/web/lib/wasm/jsep/init.ts                  | 10 +------
 .../lib/wasm/jsep/webgpu/program-manager.ts   |  1 -
 js/web/lib/wasm/jsep/webgpu/types.ts          | 10 -------
 .../core/providers/webgpu/shader_helper.cc    |  3 --
 .../core/providers/webgpu/webgpu_context.cc   |  3 +-
 6 files changed, 3 insertions(+), 52 deletions(-)

diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
index a0010df4643a4..413e89111740e 100644
--- a/js/web/lib/wasm/jsep/backend-webgpu.ts
+++ b/js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -13,7 +13,6 @@ import { ProgramManager } from './webgpu/program-manager';
 import {
   AdapterInfo,
   ComputeContext,
-  DeviceInfo,
   GpuArchitecture,
   GpuData,
   GpuVendor,
@@ -135,26 +134,6 @@ class AdapterInfoImpl implements AdapterInfo {
   }
 }

-class DeviceInfoImpl implements DeviceInfo {
-  readonly subgroupsSupported: boolean;
-  readonly subgroupsF16Supported: boolean;
-  readonly subgroupSizeRange?: readonly [number, number];
-
-  constructor(device: GPUDevice) {
-    this.subgroupsSupported = device.features.has('subgroups' as GPUFeatureName);
-    this.subgroupsF16Supported = device.features.has('subgroups' as GPUFeatureName);
-    // Currently subgroups feature is still experimental and size attributes are not in the WebGPU IDL, so we have to
-    // workaround the IDL type checks.
-    // TODO: clean this after subgroups feature is settled in IDL.
-    const deviceSubgroupsLimits = device.limits as { minSubgroupSize?: number; maxSubgroupSize?: number };
-    if (!this.subgroupsSupported || !deviceSubgroupsLimits.minSubgroupSize || !deviceSubgroupsLimits.maxSubgroupSize) {
-      this.subgroupSizeRange = undefined;
-    } else {
-      this.subgroupSizeRange = [deviceSubgroupsLimits.minSubgroupSize, deviceSubgroupsLimits.maxSubgroupSize];
-    }
-  }
-}
-
 /**
  * this class is designed to store status and being used as a singleton for JSEP. It will be passed to jsepInit() as
  * the first parameter so that it is stored for future use.
@@ -162,7 +141,6 @@ class DeviceInfoImpl implements DeviceInfo { export class WebGpuBackend { adapterInfo: AdapterInfoImpl; device: GPUDevice; - deviceInfo: DeviceInfoImpl; /** * an instance of GpuDataManager to manage a GpuDataId -> GpuBuffer mapping */ @@ -274,13 +252,9 @@ export class WebGpuBackend { } requireFeatureIfAvailable('shader-f16'); // Try subgroups - if (requireFeatureIfAvailable('subgroups' as GPUFeatureName)) { - // If subgroups feature is available, also try subgroups-f16 - requireFeatureIfAvailable('subgroups-f16' as GPUFeatureName); - } + requireFeatureIfAvailable('subgroups' as GPUFeatureName); this.device = await adapter.requestDevice(deviceDescriptor); - this.deviceInfo = new DeviceInfoImpl(this.device); this.adapterInfo = new AdapterInfoImpl(adapter.info || (await adapter.requestAdapterInfo())); this.gpuDataManager = createGpuDataManager(this); this.programManager = new ProgramManager(this); diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index b4071eae51c8f..6c2be3aa0cfe1 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -11,13 +11,7 @@ import { WebGpuBackend } from './backend-webgpu'; import { LOG_DEBUG } from './log'; import { TensorView } from './tensor-view'; import { ShapeUtil } from './util'; -import { - AdapterInfo, - ComputeContext, - ComputeContextInputsOutputsMapping, - DeviceInfo, - ProgramInfo, -} from './webgpu/types'; +import { AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo } from './webgpu/types'; import { WebNNBackend } from './backend-webnn'; /* eslint-disable no-bitwise */ @@ -76,7 +70,6 @@ class TensorViewImpl implements TensorView { class ComputeContextImpl implements ComputeContext { readonly adapterInfo: AdapterInfo; - readonly deviceInfo: DeviceInfo; readonly opKernelContext: number; readonly inputs: readonly TensorView[]; readonly outputCount: number; @@ -94,7 +87,6 @@ class ComputeContextImpl implements ComputeContext { contextDataOffset: number, ) { this.adapterInfo = backend.adapterInfo; - this.deviceInfo = backend.deviceInfo; // extract context data const ptrSize = module.PTR_SIZE; diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index 2c5180c5db3ee..18d505f57655a 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -99,7 +99,6 @@ export class ProgramManager { const extensionsInfo: Array<{ feature: GPUFeatureName; extension: string }> = [ { feature: 'shader-f16', extension: 'f16' }, { feature: 'subgroups' as GPUFeatureName, extension: 'subgroups' }, - { feature: 'subgroups-f16' as GPUFeatureName, extension: 'subgroups_f16' }, ]; extensionsInfo.forEach((info) => { if (device.features.has(info.feature)) { diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index 9321ac170d036..f3cfc6cb98cae 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -21,11 +21,6 @@ export interface AdapterInfo { isArchitecture: (architecture: GpuArchitecture) => boolean; isVendor: (vendor: GpuVendor) => boolean; } -export interface DeviceInfo { - readonly subgroupsSupported: boolean; - readonly subgroupsF16Supported: boolean; - readonly subgroupSizeRange?: readonly [number, number]; -} export interface GpuData { type: GpuDataType; @@ -165,11 +160,6 @@ export interface ComputeContext { */ readonly adapterInfo: AdapterInfo; - /** - * gpu device info - */ - readonly deviceInfo: 
DeviceInfo;
-
   /**
    * stores the pointer to OpKernelContext
    */
diff --git a/onnxruntime/core/providers/webgpu/shader_helper.cc b/onnxruntime/core/providers/webgpu/shader_helper.cc
index 8fccbacac903b..19cab9b178b1f 100644
--- a/onnxruntime/core/providers/webgpu/shader_helper.cc
+++ b/onnxruntime/core/providers/webgpu/shader_helper.cc
@@ -345,9 +345,6 @@ Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector& sha
   })) {
     ORT_RETURN_IF_NOT(device_.HasFeature(wgpu::FeatureName::ShaderF16), "Program ", program_.Name(), " requires f16 but the device does not support it.");
     ss << "enable f16;\n";
-    if (device_.HasFeature(wgpu::FeatureName::SubgroupsF16)) {
-      ss << "enable subgroups_f16;\n";
-    }
   }
   if (device_.HasFeature(wgpu::FeatureName::Subgroups)) {
     ss << "enable subgroups;\n";
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index 2bd547f406226..21e5e55588a2e 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -489,8 +489,7 @@ std::vector WebGpuContext::GetAvailableRequiredFeatures(const
 #endif
       wgpu::FeatureName::TimestampQuery,
       wgpu::FeatureName::ShaderF16,
-      wgpu::FeatureName::Subgroups,
-      wgpu::FeatureName::SubgroupsF16};
+      wgpu::FeatureName::Subgroups};
   for (auto feature : features) {
     if (adapter.HasFeature(feature)) {
       required_features.push_back(feature);

From ccf8fdd9ea8b0df0eb4b092323479ce8a47be161 Mon Sep 17 00:00:00 2001
From: Satya Kumar Jandhyala
Date: Thu, 6 Mar 2025 09:09:53 -0800
Subject: [PATCH 26/46] [JSEP/WebGPU] Fixed error in softmax dispatch. (#23906)

### Description
Fixed an error in the softmax dispatch.

### Motivation and Context
Produces the expected results for the LLaMA model.

---
 js/web/lib/wasm/jsep/webgpu/ops/attention.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts
index 6a78c8ae3b190..6a8dffb73fa08 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts
@@ -433,7 +433,7 @@ const createInPlaceSoftmaxProgramInfo = (
     getShaderSource,
     getRunData: () => ({
       outputs: [],
-      dispatchGroup: { x: Math.ceil(totalSequenceLength / WG), y: sequenceLength, z: batchSize * numHeads },
+      dispatchGroup: { x: 1, y: sequenceLength, z: batchSize * numHeads },
       programUniforms,
     }),
   };

From 101353cf5e54bdab057bf759befc8440465a1339 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Thu, 6 Mar 2025 15:23:01 -0800
Subject: [PATCH 27/46] enable WebGPU EP in WebAssembly build (#23913)

### Description
This PR is the first step for migrating the webgpu backend of onnxruntime-web from JSEP-based to WebGPU EP-based.

In this change, we enable building the WebGPU EP in a wasm build (i.e. `--build_wasm` `--use_webgpu` `--use_jsep`). However, the old build flags should still keep the previous behavior.
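From the application's point of view the migration is transparent: requesting the `webgpu` execution provider in onnxruntime-web keeps working, and the session-options changes below route it either to JSEP or to the new WebGPU EP (registered internally as `WebGPU`) depending on how the wasm artifact was built. A minimal usage sketch; the model path and feed name are hypothetical placeholders:

```
import * as ort from 'onnxruntime-web';

// 'model.onnx' and the feed name 'input' are placeholders for illustration.
const session = await ort.InferenceSession.create('model.onnx', {
  executionProviders: ['webgpu'],
});

const feeds = { input: new ort.Tensor('float32', new Float32Array(3 * 224 * 224), [1, 3, 224, 224]) };
const results = await session.run(feeds);
console.log(Object.keys(results));
```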
--- .../external/onnxruntime_external_deps.cmake | 24 +- cmake/onnxruntime_webassembly.cmake | 37 ++- cmake/patches/dawn/dawn.patch | 113 ++++++- js/build_webgpu.bat | 79 +++++ js/web/lib/build-def.d.ts | 7 + js/web/lib/wasm/jsep/init.ts | 136 ++++---- js/web/lib/wasm/session-options.ts | 116 +++++-- js/web/lib/wasm/wasm-core-impl.ts | 97 ++++-- js/web/lib/wasm/wasm-types.ts | 68 +++- js/web/script/build.ts | 17 +- .../core/framework/external_data_loader.cc | 7 +- .../core/framework/external_data_loader.h | 2 +- .../providers/webgpu/external_data_loader.cc | 40 +++ .../providers/webgpu/external_data_loader.h | 30 ++ onnxruntime/core/providers/webgpu/program.cc | 20 ++ onnxruntime/core/providers/webgpu/program.h | 1 + .../core/providers/webgpu/webgpu_context.cc | 53 +-- .../webgpu/webgpu_execution_provider.cc | 7 + .../webgpu/webgpu_execution_provider.h | 3 + .../webgpu/webgpu_provider_factory.cc | 6 + onnxruntime/wasm/api.cc | 26 +- onnxruntime/wasm/api.h | 24 +- onnxruntime/wasm/js_post_js.js | 2 - onnxruntime/wasm/js_post_js_64.js | 2 - onnxruntime/wasm/post-webgpu.js | 261 +++++++++++++++ onnxruntime/wasm/pre-async.js | 132 ++++++++ onnxruntime/wasm/pre-jsep.js | 308 ++++++------------ onnxruntime/wasm/pre.js | 15 +- tools/ci_build/build.py | 7 +- 29 files changed, 1245 insertions(+), 395 deletions(-) create mode 100644 js/build_webgpu.bat create mode 100644 onnxruntime/core/providers/webgpu/external_data_loader.cc create mode 100644 onnxruntime/core/providers/webgpu/external_data_loader.h create mode 100644 onnxruntime/wasm/post-webgpu.js create mode 100644 onnxruntime/wasm/pre-async.js diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index a579badee666c..2ab9fc129a90d 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -725,7 +725,29 @@ if (onnxruntime_USE_WEBGPU) # # if we need to apply patches in the future, we can uncomment the following line. # # The dawn.patch contains the following changes: - # - https://dawn-review.googlesource.com/c/dawn/+/225514 + # + # - (public) CMake fix to support Emscripten v4.0.3+ + # This change allows Dawn to find the file "gen_struct_info.py" in the correct location. + # https://dawn-review.googlesource.com/c/dawn/+/225514 + # + # - (public) Fix emwgpu C++ implementation for buffer destroy + # In native implementation, wgpuBufferRelease will trigger the buffer destroy (if refcount decreased to 0). But + # in emwgpu implementation, the buffer destroy won't happen. This change fixes the bug. + # https://dawn-review.googlesource.com/c/dawn/+/226315 + # + # - (private) Allow "external" buffer in emwgpu C++ implementation + # This change allows WGPUBufferImpl to destroy the buffer when the refcount decreased to 0 only for non-external + # buffer. + # "external buffer" means the GPUBuffer instance created in JavaScript and imported to C++ by `importJsBuffer`. + # + # - (private) Remove hard-coded CMAKE_OSX_DEPLOYMENT_TARGET in Dawn's CMake files + # https://github.com/microsoft/onnxruntime/pull/23729 + # + # - (private) Fix external ref count for "external" device in emwgpu C++ implementation + # This change fixes the incorrect external ref count for class WGPUDeviceImpl when used with "external" device. + # "external device" means the GPUDevice instance created in JavaScript and imported to C++ by `importJsDevice`. 
+ # + # PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch EXCLUDE_FROM_ALL ) diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 8106e46ccf580..f3afaf7033fd1 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -211,10 +211,14 @@ else() target_link_libraries(onnxruntime_webassembly PRIVATE tensorboard) endif() + set(onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre.js") + + set(EXPORTED_FUNCTIONS "_malloc,_free") if (onnxruntime_USE_JSEP) - set(EXPORTED_FUNCTIONS "_malloc,_free,_JsepOutput,_JsepGetNodeName") - else() - set(EXPORTED_FUNCTIONS "_malloc,_free") + string(APPEND EXPORTED_FUNCTIONS ",_JsepOutput,_JsepGetNodeName") + endif() + if (onnxruntime_USE_WEBGPU) + string(APPEND EXPORTED_FUNCTIONS ",_wgpuBufferRelease,_wgpuCreateInstance") endif() if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64) @@ -312,13 +316,15 @@ else() target_compile_options(noexcep_operators PRIVATE ${SMEMORY_FLAG} -Wno-experimental) endif() target_link_options(onnxruntime_webassembly PRIVATE - --post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js" + "SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js\"" ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js") else () set(MAXIMUM_MEMORY "4294967296") target_link_options(onnxruntime_webassembly PRIVATE - --post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js" + "SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/js_post_js.js\"" ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js") endif () target_link_options(onnxruntime_webassembly PRIVATE @@ -372,7 +378,6 @@ jsepDownload:_pp_") "SHELL:-s SIGNATURE_CONVERSIONS='${SIGNATURE_CONVERSIONS}'" ) endif () - set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre.js) if (onnxruntime_USE_JSEP) # NOTE: "-s ASYNCIFY=1" is required for JSEP to work with WebGPU @@ -382,10 +387,8 @@ jsepDownload:_pp_") target_compile_definitions(onnxruntime_webassembly PRIVATE USE_JSEP=1) target_link_options(onnxruntime_webassembly PRIVATE "SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js\"" - "SHELL:-s ASYNCIFY=1" - "SHELL:-s ASYNCIFY_STACK_SIZE=65536" ) - set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js") if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64) target_link_options(onnxruntime_webassembly PRIVATE @@ -397,6 +400,20 @@ jsepDownload:_pp_") if (onnxruntime_USE_WEBGPU) target_compile_definitions(onnxruntime_webassembly PRIVATE USE_WEBGPU=1) + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:--post-js \"${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js\"" + ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/post-webgpu.js") + endif() + + if (onnxruntime_USE_JSEP OR onnxruntime_USE_WEBGPU OR onnxruntime_USE_WEBNN) + # if any of the above is enabled, we need to use the asyncify library + target_link_options(onnxruntime_webassembly PRIVATE + "SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-async.js\"" + "SHELL:-s ASYNCIFY=1" + "SHELL:-s ASYNCIFY_STACK_SIZE=65536" + ) + list(APPEND onnxruntime_webassembly_script_deps "${ONNXRUNTIME_ROOT}/wasm/pre-async.js") endif() if (onnxruntime_EMSCRIPTEN_SETTINGS) @@ -458,6 +475,8 @@ jsepDownload:_pp_") ) endif() + set_target_properties(onnxruntime_webassembly 
PROPERTIES LINK_DEPENDS "${onnxruntime_webassembly_script_deps}") + set(target_name_list ort) if (onnxruntime_ENABLE_TRAINING_APIS) diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch index 2f85d5ab473b5..b578b858eac59 100644 --- a/cmake/patches/dawn/dawn.patch +++ b/cmake/patches/dawn/dawn.patch @@ -18,7 +18,7 @@ index 6e8ae37593..633af91eef 100644 @@ -77,9 +77,17 @@ if (${DAWN_ENABLE_EMSCRIPTEN}) "${arg_UNPARSED_ARGUMENTS}") endif() - + + # since Emscripten 4.0.3, file gen_struct_info.py is moved to outside of directory maint. + if (EXISTS "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/gen_struct_info.py") + set(EM_GEN_STRUCT_INFO_SCRIPT "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/gen_struct_info.py") @@ -34,3 +34,114 @@ index 6e8ae37593..633af91eef 100644 -q "${EM_BUILD_GEN_DIR}/struct_info_webgpu.json" "-I=${EM_BUILD_GEN_DIR}/include" +diff --git a/src/emdawnwebgpu/README.md b/src/emdawnwebgpu/README.md +index efd6491cd6..8ebc5d28b6 100644 +--- a/src/emdawnwebgpu/README.md ++++ b/src/emdawnwebgpu/README.md +@@ -56,7 +56,7 @@ Set up the build directory using emcmake + mkdir out/cmake-wasm + cd out/cmake-wasm + +-# Make sure the path is to the source checkout of Emscripten, not emsdk's release. ++# If using Emscripten v4.0.2 or lower, make sure the path is to the source checkout of Emscripten, not emsdk's release. + emcmake cmake -GNinja -DDAWN_EMSCRIPTEN_TOOLCHAIN="path/to/emscripten" ../.. + + ninja +diff --git a/third_party/emdawnwebgpu/webgpu.cpp b/third_party/emdawnwebgpu/webgpu.cpp +index f1c5a7d50e..16f2495712 100644 +--- a/third_party/emdawnwebgpu/webgpu.cpp ++++ b/third_party/emdawnwebgpu/webgpu.cpp +@@ -131,7 +131,6 @@ class RefCounted : NonMovable { + bool Release() { + if (mRefCount.fetch_sub(1u, std::memory_order_release) == 1u) { + std::atomic_thread_fence(std::memory_order_acquire); +- emwgpuDelete(this); + return true; + } + return false; +@@ -234,6 +233,7 @@ class Ref { + static void Release(T value) { + if (value != nullptr && value->RefCounted::Release()) { + delete value; ++ emwgpuDelete(value); + } + } + +@@ -641,7 +641,8 @@ struct WGPUAdapterImpl final : public EventSource, public RefCounted { + struct WGPUBufferImpl final : public EventSource, + public RefCountedWithExternalCount { + public: +- WGPUBufferImpl(const EventSource* source, bool mappedAtCreation); ++ WGPUBufferImpl(const EventSource* source, bool mappedAtCreation, bool isExternal); ++ ~WGPUBufferImpl(); + + void Destroy(); + const void* GetConstMappedRange(size_t offset, size_t size); +@@ -671,6 +672,7 @@ struct WGPUBufferImpl final : public EventSource, + }; + MapRequest mPendingMapRequest; + WGPUBufferMapState mMapState; ++ bool mIsExternal; + }; + + struct WGPUQueueImpl final : public EventSource, public RefCounted { +@@ -1164,11 +1166,15 @@ WGPUAdapter emwgpuCreateAdapter(const EventSource* source) { + + WGPUBuffer emwgpuCreateBuffer(const EventSource* source, + bool mappedAtCreation = false) { +- return new WGPUBufferImpl(source, mappedAtCreation); ++ return new WGPUBufferImpl(source, mappedAtCreation, true); + } + + WGPUDevice emwgpuCreateDevice(const EventSource* source, WGPUQueue queue) { +- return new WGPUDeviceImpl(source, queue); ++ // This function is only called from JS via `importJsDevice()`, which ++ // needs to increment the external ref count to fix the behavior. 
++ WGPUDeviceImpl* device = new WGPUDeviceImpl(source, queue); ++ device->AddExternalRef(); ++ return device; + } + + WGPUQueue emwgpuCreateQueue(const EventSource* source) { +@@ -1275,15 +1281,22 @@ WGPUAdapterImpl::WGPUAdapterImpl(const EventSource* source) + // WGPUBuffer implementations. + // ---------------------------------------------------------------------------- + +-WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation) ++WGPUBufferImpl::WGPUBufferImpl(const EventSource* source, bool mappedAtCreation, bool isExternal) + : EventSource(source), + mMapState(mappedAtCreation ? WGPUBufferMapState_Mapped +- : WGPUBufferMapState_Unmapped) { ++ : WGPUBufferMapState_Unmapped), ++ mIsExternal(isExternal) { + if (mappedAtCreation) { + mPendingMapRequest = {kNullFutureId, WGPUMapMode_Write}; + } + } + ++WGPUBufferImpl::~WGPUBufferImpl() { ++ if (!mIsExternal) { ++ Destroy(); ++ } ++} ++ + void WGPUBufferImpl::Destroy() { + emwgpuBufferDestroy(this); + AbortPendingMap("Buffer was destroyed before mapping was resolved."); +@@ -1504,6 +1517,7 @@ WGPUFuture WGPUShaderModuleImpl::GetCompilationInfo( + void wgpu##Name##Release(WGPU##Name o) { \ + if (o->Release()) { \ + delete o; \ ++ emwgpuDelete(o); \ + } \ + } + WGPU_OBJECTS(DEFINE_WGPU_DEFAULT_ADDREF_RELEASE) +@@ -1638,7 +1652,7 @@ void wgpuBufferUnmap(WGPUBuffer buffer) { + + WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, + const WGPUBufferDescriptor* descriptor) { +- WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation); ++ WGPUBuffer buffer = new WGPUBufferImpl(device, descriptor->mappedAtCreation, false); + emwgpuDeviceCreateBuffer(device, descriptor, buffer); + return buffer; + } diff --git a/js/build_webgpu.bat b/js/build_webgpu.bat new file mode 100644 index 0000000000000..95413509e701d --- /dev/null +++ b/js/build_webgpu.bat @@ -0,0 +1,79 @@ +@echo off + +rem build_webgpu.bat --- build onnxruntime-web with WebGPU EP +rem +rem Usage: +rem build_webgpu.bat config [clean] +rem +rem Options: +rem config Build configuration, "d" or "r" +rem clean Perform a clean build, "clean" or empty + +setlocal enabledelayedexpansion + +set ROOT=%~dp0..\ +set BUILD_DIR=%ROOT%build_webgpu + +:arg1 +if ["%~1"]==["d"] ( + set CONFIG=Debug + set CONFIG_EXTRA_FLAG= + @rem --enable_wasm_profiling --wasm_run_tests_in_browser + @rem --cmake_extra_defines onnxruntime_ENABLE_WEBASSEMBLY_OUTPUT_OPTIMIZED_MODEL=1 + @rem --enable_wasm_debug_info + goto :arg2 +) +if ["%~1"]==["r"] ( + set CONFIG=Release + set CONFIG_EXTRA_FLAG= + @rem --enable_wasm_api_exception_catching --disable_rtti + goto :arg2 +) +echo Invalid configuration "%~1", must be "d"(Debug) or "r"(Release) +exit /b 1 + +:arg2 +if ["%~2"]==["clean"] ( + goto :clean +) +if not exist "%ROOT%js\web\dist" ( + goto :npm_ci +) + +goto :build_wasm + +:clean +if exist "%BUILD_DIR%" ( + rd /s /q %BUILD_DIR% +) + +pushd %ROOT% +git submodule sync --recursive +git submodule update --init --recursive +popd + +:npm_ci +pushd %ROOT%js +call npm ci +popd +pushd %ROOT%js\common +call npm ci +popd +pushd %ROOT%js\web +call npm ci +call npm run pull:wasm +popd + +:build_wasm + +set PATH=C:\Program Files\Git\usr\bin;%PATH% + +call %ROOT%build.bat --config %CONFIG% %CONFIG_EXTRA_FLAG% --skip_submodule_sync --build_wasm --target onnxruntime_webassembly --skip_tests^ + --enable_wasm_simd --enable_wasm_threads --use_jsep --use_webnn --use_webgpu --build_dir %BUILD_DIR% + +IF NOT "%ERRORLEVEL%" == "0" ( + exit /b %ERRORLEVEL% +) + +copy /Y 
%BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.wasm %ROOT%js\web\dist\ +copy /Y %BUILD_DIR%\%CONFIG%\ort-wasm-simd-threaded.jsep.mjs %ROOT%js\web\dist\ diff --git a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts index 59f64a3179605..83a52ebaefe05 100644 --- a/js/web/lib/build-def.d.ts +++ b/js/web/lib/build-def.d.ts @@ -40,6 +40,13 @@ interface BuildDefinitions { */ readonly ENABLE_BUNDLE_WASM_JS: boolean; + /** + * defines whether to use WebGPU EP instead of JSEP for WebGPU backend. + * + * This flag requires the corresponding WebAssembly artifact to be built with `--use_webgpu` flag. + */ + readonly USE_WEBGPU_EP: boolean; + // #endregion // #region Build definitions for ESM diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 6c2be3aa0cfe1..8ab6b054bf8a7 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -1,17 +1,17 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import { Env } from 'onnxruntime-common'; +import type { Env } from 'onnxruntime-common'; import { calculateTensorSizeInBytes, DataType } from '../wasm-common'; import type { OrtWasmModule } from '../wasm-types'; -import { WebGpuBackend } from './backend-webgpu'; +import type { WebGpuBackend } from './backend-webgpu'; import { LOG_DEBUG } from './log'; -import { TensorView } from './tensor-view'; +import type { TensorView } from './tensor-view'; import { ShapeUtil } from './util'; -import { AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo } from './webgpu/types'; +import type { AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo } from './webgpu/types'; import { WebNNBackend } from './backend-webnn'; /* eslint-disable no-bitwise */ @@ -197,79 +197,83 @@ export const init = async ( } if (name === 'webgpu') { - const backend = new WebGpuBackend(); - await backend.initialize(env, gpuAdapter!); + if (!BUILD_DEFS.USE_WEBGPU_EP) { + // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires + const webGpuBackendImpl = require('./backend-webgpu').WebGpuBackend; + const backend = new webGpuBackendImpl(); + await backend.initialize(env, gpuAdapter!); - jsepInit('webgpu', [ - // backend - backend, + jsepInit('webgpu', [ + // backend + backend, + + // jsepAlloc() + (size: number) => backend.alloc(Number(size)), - // jsepAlloc() - (size: number) => backend.alloc(Number(size)), + // jsepFree() + (ptr: number) => backend.free(ptr), - // jsepFree() - (ptr: number) => backend.free(ptr), + // jsepCopy(src, dst, size, isSourceGpu) + (src: number, dst: number, size: number, isSourceGpu = false) => { + if (isSourceGpu) { + LOG_DEBUG( + 'verbose', + () => `[WebGPU] jsepCopyGpuToGpu: src=${Number(src)}, dst=${Number(dst)}, size=${Number(size)}`, + ); + backend.memcpy(Number(src), Number(dst)); + } else { + LOG_DEBUG( + 'verbose', + () => + `[WebGPU] jsepCopyCpuToGpu: dataOffset=${Number(src)}, gpuDataId=${Number(dst)}, size=${Number(size)}`, + ); + const data = module.HEAPU8.subarray(Number(src >>> 0), Number(src >>> 0) + Number(size)); + backend.upload(Number(dst), data); + } + }, - // jsepCopy(src, dst, size, isSourceGpu) - (src: number, dst: number, size: number, isSourceGpu = false) => { - if (isSourceGpu) { + // jsepCopyAsync(src, dst, size) + async (gpuDataId: number, dataOffset: number, size: number): Promise => { LOG_DEBUG( 'verbose', - () => `[WebGPU] jsepCopyGpuToGpu: src=${Number(src)}, dst=${Number(dst)}, size=${Number(size)}`, + 
() => `[WebGPU] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`, ); - backend.memcpy(Number(src), Number(dst)); - } else { - LOG_DEBUG( - 'verbose', - () => - `[WebGPU] jsepCopyCpuToGpu: dataOffset=${Number(src)}, gpuDataId=${Number(dst)}, size=${Number(size)}`, - ); - const data = module.HEAPU8.subarray(Number(src >>> 0), Number(src >>> 0) + Number(size)); - backend.upload(Number(dst), data); - } - }, - // jsepCopyAsync(src, dst, size) - async (gpuDataId: number, dataOffset: number, size: number): Promise => { - LOG_DEBUG( - 'verbose', - () => `[WebGPU] jsepCopyGpuToCpu: gpuDataId=${gpuDataId}, dataOffset=${dataOffset}, size=${size}`, - ); - - await backend.download(Number(gpuDataId), () => - module.HEAPU8.subarray(Number(dataOffset) >>> 0, Number(dataOffset + size) >>> 0), - ); - }, + await backend.download(Number(gpuDataId), () => + module.HEAPU8.subarray(Number(dataOffset) >>> 0, Number(dataOffset + size) >>> 0), + ); + }, - // jsepCreateKernel - (kernelType: string, kernelId: number, attribute: unknown) => - backend.createKernel( - kernelType, - Number(kernelId), - attribute, - module.UTF8ToString(module._JsepGetNodeName!(Number(kernelId))), - ), + // jsepCreateKernel + (kernelType: string, kernelId: number, attribute: unknown) => + backend.createKernel( + kernelType, + Number(kernelId), + attribute, + module.UTF8ToString(module._JsepGetNodeName!(Number(kernelId))), + ), - // jsepReleaseKernel - (kernel: number) => backend.releaseKernel(kernel), + // jsepReleaseKernel + (kernel: number) => backend.releaseKernel(kernel), - // jsepRun - (kernel: number, contextDataOffset: number, sessionHandle: number, errors: Array>) => { - LOG_DEBUG( - 'verbose', - () => - `[WebGPU] jsepRun: sessionHandle=${sessionHandle}, kernel=${kernel}, contextDataOffset=${contextDataOffset}`, - ); - const context = new ComputeContextImpl(module, backend, Number(contextDataOffset)); - return backend.computeKernel(Number(kernel), context, errors); - }, - // jsepCaptureBegin - () => backend.captureBegin(), - // jsepCaptureEnd - () => backend.captureEnd(), - // jsepReplay - () => backend.replay(), - ]); + // jsepRun + (kernel: number, contextDataOffset: number, sessionHandle: number, errors: Array>) => { + LOG_DEBUG( + 'verbose', + () => + `[WebGPU] jsepRun: sessionHandle=${sessionHandle}, kernel=${kernel}, contextDataOffset=${contextDataOffset}`, + ); + const context = new ComputeContextImpl(module, backend, Number(contextDataOffset)); + return backend.computeKernel(Number(kernel), context, errors); + }, + // jsepCaptureBegin + () => backend.captureBegin(), + // jsepCaptureEnd + () => backend.captureEnd(), + // jsepReplay + () => backend.replay(), + ]); + } } else { const backend = new WebNNBackend(env); jsepInit('webnn', [ diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 17e564247863d..89a4484e5a1c4 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-import { InferenceSession } from 'onnxruntime-common'; +import type { InferenceSession } from 'onnxruntime-common'; import { getInstance } from './wasm-factory'; import { allocWasmString, checkLastError, iterateExtraOptions } from './wasm-utils'; @@ -54,13 +54,28 @@ const appendDefaultOptions = (options: InferenceSession.SessionOptions): void => } }; -const setExecutionProviders = ( +const appendSessionConfig = (sessionOptionsHandle: number, key: string, value: string, allocs: number[]): void => { + const keyDataOffset = allocWasmString(key, allocs); + const valueDataOffset = allocWasmString(value, allocs); + if (getInstance()._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { + checkLastError(`Can't set a session config entry: ${key} - ${value}.`); + } +}; + +const appendEpOption = (epOptions: Array<[number, number]>, key: string, value: string, allocs: number[]): void => { + const keyDataOffset = allocWasmString(key, allocs); + const valueDataOffset = allocWasmString(value, allocs); + epOptions.push([keyDataOffset, valueDataOffset]); +}; + +const setExecutionProviders = async ( sessionOptionsHandle: number, executionProviders: readonly InferenceSession.ExecutionProviderConfig[], allocs: number[], -): void => { +): Promise => { for (const ep of executionProviders) { let epName = typeof ep === 'string' ? ep : ep.name; + const epOptions: Array<[number, number]> = []; // check EP name switch (epName) { @@ -71,26 +86,44 @@ const setExecutionProviders = ( // const context = (webnnOptions as InferenceSession.WebNNOptionsWithMLContext)?.context; const deviceType = (webnnOptions as InferenceSession.WebNNContextOptions)?.deviceType; if (deviceType) { - const keyDataOffset = allocWasmString('deviceType', allocs); - const valueDataOffset = allocWasmString(deviceType, allocs); - if (getInstance()._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError(`Can't set a session config entry: 'deviceType' - ${deviceType}.`); - } + appendSessionConfig(sessionOptionsHandle, 'deviceType', deviceType, allocs); } } break; case 'webgpu': - epName = 'JS'; - if (typeof ep !== 'string') { - const webgpuOptions = ep as InferenceSession.WebGpuExecutionProviderOption; - if (webgpuOptions?.preferredLayout) { - if (webgpuOptions.preferredLayout !== 'NCHW' && webgpuOptions.preferredLayout !== 'NHWC') { - throw new Error(`preferredLayout must be either 'NCHW' or 'NHWC': ${webgpuOptions.preferredLayout}`); + if (BUILD_DEFS.USE_WEBGPU_EP) { + epName = 'WebGPU'; + let customDevice: GPUDevice | undefined; + + if (typeof ep !== 'string') { + const customOptions = ep as unknown as { device: GPUDevice }; + if (customOptions.device) { + if (typeof GPUDevice !== 'undefined' && customOptions.device instanceof GPUDevice) { + customDevice = customOptions.device; + } else { + throw new Error('Invalid GPU device set in WebGPU EP options.'); + } } - const keyDataOffset = allocWasmString('preferredLayout', allocs); - const valueDataOffset = allocWasmString(webgpuOptions.preferredLayout, allocs); - if (getInstance()._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError(`Can't set a session config entry: 'preferredLayout' - ${webgpuOptions.preferredLayout}.`); + + // TODO: handle more options + } + + const info = getInstance().webgpuRegisterDevice!(customDevice); + if (info) { + const [deviceId, instanceHandle, deviceHandle] = info; + appendEpOption(epOptions, 'deviceId', deviceId.toString(), allocs); 
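+            // The instance/device handles below come from `webgpuRegisterDevice`; the native
+            // WebGPU EP parses these provider options to adopt the caller-provided GPUDevice
+            // instead of creating its own.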
+ appendEpOption(epOptions, 'webgpuInstance', instanceHandle.toString(), allocs); + appendEpOption(epOptions, 'webgpuDevice', deviceHandle.toString(), allocs); + } + } else { + epName = 'JS'; + if (typeof ep !== 'string') { + const webgpuOptions = ep as InferenceSession.WebGpuExecutionProviderOption; + if (webgpuOptions?.preferredLayout) { + if (webgpuOptions.preferredLayout !== 'NCHW' && webgpuOptions.preferredLayout !== 'NHWC') { + throw new Error(`preferredLayout must be either 'NCHW' or 'NHWC': ${webgpuOptions.preferredLayout}`); + } + appendSessionConfig(sessionOptionsHandle, 'preferredLayout', webgpuOptions.preferredLayout, allocs); } } } @@ -103,13 +136,34 @@ const setExecutionProviders = ( } const epNameDataOffset = allocWasmString(epName, allocs); - if (getInstance()._OrtAppendExecutionProvider(sessionOptionsHandle, epNameDataOffset) !== 0) { + const epOptionsCount = epOptions.length; + let keysOffset = 0; + let valuesOffset = 0; + if (epOptionsCount > 0) { + keysOffset = getInstance()._malloc(epOptionsCount * getInstance().PTR_SIZE); + allocs.push(keysOffset); + valuesOffset = getInstance()._malloc(epOptionsCount * getInstance().PTR_SIZE); + allocs.push(valuesOffset); + for (let i = 0; i < epOptionsCount; i++) { + getInstance().setValue(keysOffset + i * getInstance().PTR_SIZE, epOptions[i][0], '*'); + getInstance().setValue(valuesOffset + i * getInstance().PTR_SIZE, epOptions[i][1], '*'); + } + } + if ( + (await getInstance()._OrtAppendExecutionProvider( + sessionOptionsHandle, + epNameDataOffset, + keysOffset, + valuesOffset, + epOptionsCount, + )) !== 0 + ) { checkLastError(`Can't append execution provider: ${epName}.`); } } }; -export const setSessionOptions = (options?: InferenceSession.SessionOptions): [number, number[]] => { +export const setSessionOptions = async (options?: InferenceSession.SessionOptions): Promise<[number, number[]]> => { const wasm = getInstance(); let sessionOptionsHandle = 0; const allocs: number[] = []; @@ -155,20 +209,19 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n } if (sessionOptions.executionProviders) { - setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); + await setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); } if (sessionOptions.enableGraphCapture !== undefined) { if (typeof sessionOptions.enableGraphCapture !== 'boolean') { throw new Error(`enableGraphCapture must be a boolean value: ${sessionOptions.enableGraphCapture}`); } - const keyDataOffset = allocWasmString('enableGraphCapture', allocs); - const valueDataOffset = allocWasmString(sessionOptions.enableGraphCapture.toString(), allocs); - if (wasm._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - checkLastError( - `Can't set a session config entry: 'enableGraphCapture' - ${sessionOptions.enableGraphCapture}.`, - ); - } + appendSessionConfig( + sessionOptionsHandle, + 'enableGraphCapture', + sessionOptions.enableGraphCapture.toString(), + allocs, + ); } if (sessionOptions.freeDimensionOverrides) { @@ -188,12 +241,7 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n if (sessionOptions.extra !== undefined) { iterateExtraOptions(sessionOptions.extra, '', new WeakSet>(), (key, value) => { - const keyDataOffset = allocWasmString(key, allocs); - const valueDataOffset = allocWasmString(value, allocs); - - if (wasm._OrtAddSessionConfigEntry(sessionOptionsHandle, keyDataOffset, valueDataOffset) !== 0) { - 
checkLastError(`Can't set a session config entry: ${key} - ${value}.`);
-      }
+      appendSessionConfig(sessionOptionsHandle, key, value, allocs);
     });
   }
 
diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts
index 4bccfa76fdda3..dbcf80adf3552 100644
--- a/js/web/lib/wasm/wasm-core-impl.ts
+++ b/js/web/lib/wasm/wasm-core-impl.ts
@@ -102,11 +102,20 @@ export const initRuntime = async (env: Env): Promise<void> => {
  * @param epName
  */
 export const initEp = async (env: Env, epName: string): Promise<void> => {
+  // initialize ASYNCIFY support
+  getInstance().asyncInit?.();
+
+  if (epName === 'webgpu' && BUILD_DEFS.USE_WEBGPU_EP) {
+    getInstance().webgpuInit!((device) => {
+      env.webgpu.device = device;
+    });
+  }
+
   if (!BUILD_DEFS.DISABLE_JSEP) {
     // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires
     const initJsep = require('./jsep/init').init;
 
-    if (epName === 'webgpu') {
+    if (epName === 'webgpu' && !BUILD_DEFS.USE_WEBGPU_EP) {
       // perform WebGPU availability check
       if (typeof navigator === 'undefined' || !navigator.gpu) {
         throw new Error('WebGPU is not supported in current environment');
@@ -270,7 +279,7 @@ export const createSession = async (
   const outputNamesUTF8Encoded = [];
 
   try {
-    [sessionOptionsHandle, allocs] = setSessionOptions(options);
+    [sessionOptionsHandle, allocs] = await setSessionOptions(options);
 
     if (options?.externalData && wasm.mountExternalData) {
       const loadingPromises = [];
@@ -278,7 +287,7 @@ export const createSession = async (
         const path = typeof file === 'string' ? file : file.path;
         loadingPromises.push(
           loadFile(typeof file === 'string' ? file : file.data).then((data) => {
-            wasm.mountExternalData!(path, data);
+            wasm.mountExternalData(path, data);
           }),
         );
       }
@@ -312,6 +321,7 @@ export const createSession = async (
     }
 
     sessionHandle = await wasm._OrtCreateSession(modelDataOffset, modelDataLength, sessionOptionsHandle);
+    wasm.webgpuOnCreateSession?.(sessionHandle);
     if (sessionHandle === 0) {
       checkLastError("Can't create a session.");
     }
@@ -444,6 +454,7 @@ export const releaseSession = (sessionId: number): void => {
   }
 
   wasm.jsepOnReleaseSession?.(sessionId);
+  wasm.webgpuOnReleaseSession?.(sessionId);
 
   inputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf));
   outputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf));
@@ -491,11 +502,20 @@ export const prepareInputOutputTensor = async (
       const gpuBuffer = tensor[2].gpuBuffer;
       dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!;
 
-      const registerBuffer = wasm.jsepRegisterBuffer;
-      if (!registerBuffer) {
-        throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.');
+      if (BUILD_DEFS.USE_WEBGPU_EP) {
+        const registerBuffer = wasm.webgpuRegisterBuffer;
+        if (!registerBuffer) {
+          throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.');
+        }
+
+        rawData = registerBuffer(gpuBuffer, sessionId);
+      } else {
+        const registerBuffer = wasm.jsepRegisterBuffer;
+        if (!registerBuffer) {
+          throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.');
+        }
+        rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength);
       }
-      rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength);
     } else if (location === 'ml-tensor') {
       const mlTensor = tensor[2].mlTensor as MLTensor;
       dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!;
@@ -791,7 +811,7 @@ export const run = async (
       // If a certain output's preferred location is
GPU but the tensor is empty, we still need to create a CPU // tensor for it. There is no mapping GPU buffer for an empty tensor. if (preferredLocation === 'gpu-buffer' && size > 0) { - const getBuffer = wasm.jsepGetBuffer; + const getBuffer = BUILD_DEFS.USE_WEBGPU_EP ? wasm.webgpuGetBuffer : wasm.jsepGetBuffer; if (!getBuffer) { throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.'); } @@ -804,20 +824,43 @@ export const run = async ( // do not release the tensor right now. it will be released when user calls tensor.dispose(). keepOutputTensor = true; - output.push([ - type, - dims, - { - gpuBuffer, - download: wasm.jsepCreateDownloader!(gpuBuffer, bufferSize, type), - dispose: () => { - if (wasm._OrtReleaseTensor(tensor) !== 0) { - checkLastError("Can't release tensor."); - } + if (BUILD_DEFS.USE_WEBGPU_EP) { + wasm.webgpuRegisterBuffer!(gpuBuffer, sessionId, dataOffset); + const downloadDataFunction = wasm.webgpuCreateDownloader!(gpuBuffer, bufferSize, sessionId); + output.push([ + type, + dims, + { + gpuBuffer, + download: async () => { + const arrayBuffer = await downloadDataFunction(); + const data = new (tensorTypeToTypedArrayConstructor(type!))(arrayBuffer); + return data as Tensor.DataTypeMap[Tensor.GpuBufferDataTypes]; + }, + dispose: () => { + if (wasm._OrtReleaseTensor(tensor) !== 0) { + checkLastError("Can't release tensor."); + } + }, }, - }, - 'gpu-buffer', - ]); + 'gpu-buffer', + ]); + } else { + output.push([ + type, + dims, + { + gpuBuffer, + download: wasm.jsepCreateDownloader!(gpuBuffer, bufferSize, type), + dispose: () => { + if (wasm._OrtReleaseTensor(tensor) !== 0) { + checkLastError("Can't release tensor."); + } + }, + }, + 'gpu-buffer', + ]); + } } else if (preferredLocation === 'ml-tensor' && size > 0) { const ensureTensor = wasm.jsepEnsureTensor; if (!ensureTensor) { @@ -887,6 +930,18 @@ export const run = async ( } finally { wasm.stackRestore(beforeRunStack); + if (BUILD_DEFS.USE_WEBGPU_EP) { + inputTensors.forEach((t) => { + if (t && t[3] === 'gpu-buffer') { + wasm.webgpuUnregisterBuffer!(t[2].gpuBuffer); + } + }); + outputTensors.forEach((t) => { + if (t && t[3] === 'gpu-buffer') { + wasm.webgpuUnregisterBuffer!(t[2].gpuBuffer); + } + }); + } inputTensorHandles.forEach((v) => wasm._OrtReleaseTensor(v)); outputTensorHandles.forEach((v) => wasm._OrtReleaseTensor(v)); inputOutputAllocs.forEach((p) => wasm._free(p)); diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index b4871e145f4d7..9b2ec71fd351d 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -41,18 +41,6 @@ export declare namespace JSEP { type DownloadTensorFunction = (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise; export interface Module extends WebGpuModule, WebNnModule { - /** - * Mount the external data file to an internal map, which will be used during session initialization. - * - * @param externalDataFilePath - specify the relative path of the external data file. - * @param externalDataFileData - specify the content data. - */ - mountExternalData(externalDataFilePath: string, externalDataFileData: Uint8Array): void; - /** - * Unmount all external data files from the internal map. - */ - unmountExternalData(): void; - /** * This is the entry of JSEP initialization. This function is called once when initializing ONNX Runtime per * backend. This function initializes Asyncify support. 
If name is 'webgpu', also initializes WebGPU backend and
@@ -294,6 +282,21 @@ export declare namespace JSEP {
   }
 }
 
+export declare namespace WebGpu {
+  export interface Module {
+    webgpuInit(setDefaultDevice: (device: GPUDevice) => void): void;
+    webgpuRegisterDevice(
+      device?: GPUDevice,
+    ): undefined | [deviceId: number, instanceHandle: number, deviceHandle: number];
+    webgpuOnCreateSession(sessionHandle: number): void;
+    webgpuOnReleaseSession(sessionHandle: number): void;
+    webgpuRegisterBuffer(buffer: GPUBuffer, sessionHandle: number, bufferHandle?: number): number;
+    webgpuUnregisterBuffer(buffer: GPUBuffer): void;
+    webgpuGetBuffer(bufferHandle: number): GPUBuffer;
+    webgpuCreateDownloader(gpuBuffer: GPUBuffer, size: number, sessionHandle: number): () => Promise<ArrayBuffer>;
+  }
+}
+
 export interface OrtInferenceAPIs {
   _OrtInit(numThreads: number, loggingLevel: number): number;
 
@@ -358,7 +361,13 @@ export interface OrtInferenceAPIs {
     logVerbosityLevel: number,
     optimizedModelFilePath: number,
   ): number;
-  _OrtAppendExecutionProvider(sessionOptionsHandle: number, name: number): number;
+  _OrtAppendExecutionProvider(
+    sessionOptionsHandle: number,
+    name: number,
+    providerOptionsKeys: number,
+    providerOptionsValues: number,
+    numKeys: number,
+  ): Promise<number>;
   _OrtAddFreeDimensionOverride(sessionOptionsHandle: number, name: number, dim: number): number;
   _OrtAddSessionConfigEntry(sessionOptionsHandle: number, configKey: number, configValue: number): number;
   _OrtReleaseSessionOptions(sessionOptionsHandle: number): number;
@@ -373,8 +382,11 @@ export interface OrtInferenceAPIs {
 /**
  * The interface of the WebAssembly module for ONNX Runtime, compiled from C++ source code by Emscripten.
  */
-export interface OrtWasmModule extends EmscriptenModule, OrtInferenceAPIs, Partial<JSEP.Module> {
-  PTR_SIZE: number;
+export interface OrtWasmModule
+  extends EmscriptenModule,
+    OrtInferenceAPIs,
+    Partial<JSEP.Module>,
+    Partial<WebGpu.Module> {
   // #region emscripten functions
   stackSave(): number;
   stackRestore(stack: number): void;
@@ -387,7 +399,31 @@ export interface OrtWasmModule extends EmscriptenModule, OrtInferenceAPIs, Partial<JSEP.Module> {
   stringToUTF8(str: string, offset: number, maxBytes: number): void;
   // #endregion
 
+  // #region ORT shared
+
+  readonly PTR_SIZE: 4 | 8;
+
+  /**
+   * Mount the external data file to an internal map, which will be used during session initialization.
+   *
+   * @param externalDataFilePath - specify the relative path of the external data file.
+   * @param externalDataFileData - specify the content data.
+   */
+  mountExternalData(externalDataFilePath: string, externalDataFileData: Uint8Array): void;
+  /**
+   * Unmount all external data files from the internal map.
+   */
+  unmountExternalData(): void;
+
+  /**
+   * This function patches the WebAssembly module to support Asyncify. This function should be called at least once
+   * before any ORT API is called.
+   */
+  asyncInit?(): void;
+
+  // #endregion
+
   // #region config
-  numThreads?: number;
+  readonly numThreads?: number;
   // #endregion
 }
diff --git a/js/web/script/build.ts b/js/web/script/build.ts
index 7966262631bbf..98e61c9f87fbb 100644
--- a/js/web/script/build.ts
+++ b/js/web/script/build.ts
@@ -27,7 +27,8 @@ const args = minimist(process.argv.slice(2));
 * --bundle-mode=node
 *        Build a single ort-web bundle for nodejs.
*/ -const BUNDLE_MODE: 'prod' | 'dev' | 'perf' | 'node' = args['bundle-mode'] || 'prod'; +const BUNDLE_MODE: 'prod' | 'dev' | 'perf' | 'node' = + process.env.npm_config_bundle_mode || args['bundle-mode'] || 'prod'; /** * --debug @@ -41,7 +42,18 @@ const BUNDLE_MODE: 'prod' | 'dev' | 'perf' | 'node' = args['bundle-mode'] || 'pr * Enable debug mode. In this mode, esbuild metafile feature will be enabled. Full bundle analysis will be saved to a * file as JSON. */ -const DEBUG = args.debug; // boolean|'verbose'|'save' +const DEBUG = process.env.npm_config_debug || args.debug; // boolean|'verbose'|'save' + +/** + * --webgpu-ep + * --no-webgpu-ep (default) + * + * Enable or disable the use of WebGPU EP. If enabled, the WebGPU EP will be used. If disabled, the WebGPU backend will + * be used with JSEP. + * + * (temporary) This flag is used to test the WebGPU EP integration. It will be removed in the future. + */ +const USE_WEBGPU_EP = process.env.npm_config_webgpu_ep ?? args['webgpu-ep'] ?? false; /** * Root folder of the source code: `/js/` @@ -57,6 +69,7 @@ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.ENABLE_BUNDLE_WASM_JS': 'false', + 'BUILD_DEFS.USE_WEBGPU_EP': JSON.stringify(!!USE_WEBGPU_EP), 'BUILD_DEFS.IS_ESM': 'false', 'BUILD_DEFS.ESM_IMPORT_META_URL': 'undefined', diff --git a/onnxruntime/core/framework/external_data_loader.cc b/onnxruntime/core/framework/external_data_loader.cc index fe73a55735631..c577805e69cc4 100644 --- a/onnxruntime/core/framework/external_data_loader.cc +++ b/onnxruntime/core/framework/external_data_loader.cc @@ -60,7 +60,12 @@ common::Status LoadWebAssemblyExternalData(const Env& env, break; case 1: // Load external data to GPU. - Module.jsepUploadExternalBuffer(dataIdOrBuffer, data); + // TODO: use a unified interface for upload external buffer. + if (Module.webgpuUploadExternalBuffer) { + Module.webgpuUploadExternalBuffer(dataIdOrBuffer, data); + } else { + Module.jsepUploadExternalBuffer(dataIdOrBuffer, data); + } break; default: return 4; // Unknown error occurred in memory copy. diff --git a/onnxruntime/core/framework/external_data_loader.h b/onnxruntime/core/framework/external_data_loader.h index 117da7d0a4afa..90d48ca800797 100644 --- a/onnxruntime/core/framework/external_data_loader.h +++ b/onnxruntime/core/framework/external_data_loader.h @@ -42,7 +42,7 @@ class IExternalDataLoader { enum class ExternalDataLoadType { CPU = 0, -#if defined(USE_JSEP) +#if defined(USE_JSEP) || defined(USE_WEBGPU) WEBGPU_BUFFER = 1, #endif }; diff --git a/onnxruntime/core/providers/webgpu/external_data_loader.cc b/onnxruntime/core/providers/webgpu/external_data_loader.cc new file mode 100644 index 0000000000000..6da9598b146f5 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/external_data_loader.cc @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
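+//
+// This loader backs the WebGPU EP in WebAssembly builds: it routes external initializer
+// data (weights stored outside the .onnx file) through LoadWebAssemblyExternalData, which
+// copies it either into CPU memory or directly into a WebGPU buffer via the JS glue
+// (webgpuUploadExternalBuffer when built with --use_webgpu, jsepUploadExternalBuffer otherwise).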
+
+#if defined(__wasm__)
+
+#include 
+
+#include "core/framework/tensor.h"
+#include "core/providers/webgpu/external_data_loader.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+bool ExternalDataLoader::CanLoad(const OrtMemoryInfo& target_memory_info) const {
+  return target_memory_info.device.Type() == OrtDevice::CPU ||
+         (target_memory_info.device.Type() == OrtDevice::GPU && target_memory_info.name == WEBGPU_BUFFER);
+}
+
+common::Status ExternalDataLoader::LoadTensor(const Env& env,
+                                              const std::filesystem::path& data_file_path,
+                                              FileOffsetType data_offset,
+                                              SafeInt<size_t> data_length,
+                                              Tensor& tensor) const {
+  ExternalDataLoadType load_type;
+  if (tensor.Location().device.Type() == OrtDevice::CPU) {
+    load_type = ExternalDataLoadType::CPU;
+  } else if (tensor.Location().device.Type() == OrtDevice::GPU &&
+             tensor.Location().name == WEBGPU_BUFFER) {
+    load_type = ExternalDataLoadType::WEBGPU_BUFFER;
+  } else {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported tensor location: ", tensor.Location().ToString());
+  }
+
+  return LoadWebAssemblyExternalData(env, data_file_path, data_offset, data_length, load_type, tensor.MutableDataRaw());
+}
+
+}  // namespace webgpu
+}  // namespace onnxruntime
+
+#endif
diff --git a/onnxruntime/core/providers/webgpu/external_data_loader.h b/onnxruntime/core/providers/webgpu/external_data_loader.h
new file mode 100644
index 0000000000000..7ced4e930bf7a
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/external_data_loader.h
@@ -0,0 +1,30 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#if defined(__wasm__)
+
+#include "core/framework/external_data_loader.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+class ExternalDataLoader : public IExternalDataLoader {
+ public:
+  ExternalDataLoader() {};
+  ~ExternalDataLoader() {};
+
+  bool CanLoad(const OrtMemoryInfo& target_memory_info) const override;
+
+  common::Status LoadTensor(const Env& env,
+                            const std::filesystem::path& data_file_path,
+                            FileOffsetType data_offset,
+                            SafeInt<size_t> data_length,
+                            Tensor& tensor) const override;
+};
+
+}  // namespace webgpu
+}  // namespace onnxruntime
+
+#endif
diff --git a/onnxruntime/core/providers/webgpu/program.cc b/onnxruntime/core/providers/webgpu/program.cc
index d1d4c242c4697..976b7927ac3dd 100644
--- a/onnxruntime/core/providers/webgpu/program.cc
+++ b/onnxruntime/core/providers/webgpu/program.cc
@@ -206,6 +206,26 @@ ProgramVariableDataType ToProgramVariableDataType(int32_t element_type, int comp
   }
 }
 
+std::ostream& operator<<(std::ostream& os, ValidationMode mode) {
+  switch (mode) {
+    case ValidationMode::Disabled:
+      os << "Disabled";
+      break;
+    case ValidationMode::WGPUOnly:
+      os << "WGPUOnly";
+      break;
+    case ValidationMode::Basic:
+      os << "Basic";
+      break;
+    case ValidationMode::Full:
+      os << "Full";
+      break;
+    default:
+      os << "Unknown(" << static_cast<int>(mode) << ")";
+  }
+  return os;
+}
+
 namespace {
 TensorShape GetReducedShape(const TensorShape& shape, int component /* > 1 */) {
   ORT_ENFORCE(shape.NumDimensions() > 0 && shape.GetDims()[shape.NumDimensions() - 1] % component == 0,
diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h
index 7bfd9e8800099..95fef36144025 100644
--- a/onnxruntime/core/providers/webgpu/program.h
+++ b/onnxruntime/core/providers/webgpu/program.h
@@ -237,6 +237,7 @@ enum class ValidationMode {
   Basic,
   Full
 };
+std::ostream& operator<<(std::ostream& os, ValidationMode mode);
 
 namespace details {
 class
ProgramWrapper;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index 21e5e55588a2e..14c12ac247080 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -134,6 +134,8 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi
     ORT_ENFORCE(device_ != nullptr, "Failed to get a WebGPU device.");
   }
 
+  LOGS_DEFAULT(VERBOSE) << "WebGPU EP Context is created for: Instance=" << instance_.Get() << ", Device=" << device_.Get() << ".";
+
   // cache adapter info
   ORT_ENFORCE(Device().GetAdapterInfo(&adapter_info_));
   // cache device limits
@@ -706,45 +708,46 @@ WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& co
   WGPUInstance instance = config.instance;
   WGPUDevice device = config.device;
 
-  if (context_id == 0) {
-    // context ID is preserved for the default context. User cannot use context ID 0 as a custom context.
-    ORT_ENFORCE(instance == nullptr && device == nullptr,
-                "WebGPU EP default context (contextId=0) must not have custom WebGPU instance or device.");
-
-    std::call_once(init_default_flag_, [
+  std::call_once(init_default_flag_, [
 #if !defined(__wasm__)
-      dawn_proc_table = config.dawn_proc_table
+    dawn_proc_table = config.dawn_proc_table
 #endif
-    ]() {
-      // Step.1 - setup dawn proc table (only for non-WASM build)
+  ]() {
+    // Step.1 - setup dawn proc table (only for non-WASM build)
 #if !defined(__wasm__)
-      const DawnProcTable* dawn_procs = reinterpret_cast<const DawnProcTable*>(dawn_proc_table);
+    const DawnProcTable* dawn_procs = reinterpret_cast<const DawnProcTable*>(dawn_proc_table);
 #if defined(BUILD_DAWN_MONOLITHIC_LIBRARY)
-      ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn.");
+    ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn.");
 #else
 #if !defined(USE_EXTERNAL_DAWN)
-      if (dawn_procs == nullptr) {
-        dawn_procs = &dawn::native::GetProcs();
-      }
+    if (dawn_procs == nullptr) {
+      dawn_procs = &dawn::native::GetProcs();
+    }
 #else
-      ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided.");
+    ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided.");
 #endif
-      dawnProcSetProcs(dawn_procs);
+    dawnProcSetProcs(dawn_procs);
 #endif
 #endif
-      // Step.2 - Create wgpu::Instance
+    // Step.2 - Create wgpu::Instance
 #if !defined(__wasm__)
-      wgpu::InstanceDescriptor instance_desc{};
-      instance_desc.capabilities.timedWaitAnyEnable = true;
-      default_instance_ = wgpu::CreateInstance(&instance_desc);
+    wgpu::InstanceDescriptor instance_desc{};
+    instance_desc.capabilities.timedWaitAnyEnable = true;
+    default_instance_ = wgpu::CreateInstance(&instance_desc);
 #else
-      default_instance_ = wgpu::CreateInstance(nullptr);
+    default_instance_ = wgpu::CreateInstance(nullptr);
 #endif
-      ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance.");
-    });
+    ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance.");
+  });
+
+  if (context_id == 0) {
+    // context ID is preserved for the default context. User cannot use context ID 0 as a custom context.
+    ORT_ENFORCE(instance == nullptr && device == nullptr,
+                "WebGPU EP default context (contextId=0) must not have custom WebGPU instance or device.");
+    instance = default_instance_.Get();
   } else {
     // for context ID > 0, user must provide custom WebGPU instance and device.
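For reference, the caller-facing side of this requirement is the `device` session option handled in `js/web/lib/wasm/session-options.ts` above. A minimal sketch of supplying a custom `GPUDevice` from JavaScript, assuming a bundle built with `--use_webgpu` (the option is still read through an untyped cast per the TODO in that file, and `model.onnx` is a placeholder path):

```ts
import * as ort from 'onnxruntime-web';

// Sketch only: `device` is not yet part of the typed WebGPU EP options
// (see the TODO in session-options.ts), so the EP config is cast for illustration.
const adapter = await navigator.gpu.requestAdapter();
const device = await adapter!.requestDevice();

const session = await ort.InferenceSession.create('model.onnx', {
  executionProviders: [{ name: 'webgpu', device } as never],
});
```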
@@ -798,5 +801,9 @@ void CleanupWebGpuContexts() {
   WebGpuContextFactory::Cleanup();
 }
 
+WGPUDevice GetDevice(int context_id) {
+  return WebGpuContextFactory::GetContext(context_id).Device().Get();
+}
+
 }  // namespace webgpu
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index a2b8709e0e075..d673f9d0717f0 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -23,6 +23,7 @@
 
 #include "core/providers/webgpu/webgpu_context.h"
 #include "core/providers/webgpu/data_transfer.h"
+#include "core/providers/webgpu/external_data_loader.h"
 #include "core/providers/webgpu/webgpu_profiler.h"
 
 namespace onnxruntime {
@@ -825,6 +826,12 @@ std::unique_ptr<onnxruntime::IDataTransfer> WebGpuExecutionProvider::GetDataTransfer() const {
   return std::make_unique<webgpu::DataTransfer>(context_);
 }
 
+#if defined(__wasm__)
+std::unique_ptr<onnxruntime::IExternalDataLoader> WebGpuExecutionProvider::GetExternalDataLoader() const {
+  return std::make_unique<webgpu::ExternalDataLoader>();
+}
+#endif
+
 WebGpuExecutionProvider::~WebGpuExecutionProvider() {
   WebGpuContextFactory::ReleaseContext(context_id_);
 }
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
index 7a0ade97aa3df..dc25636821651 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
@@ -49,6 +49,9 @@ class WebGpuExecutionProvider : public IExecutionProvider {
   std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
 
   std::unique_ptr<onnxruntime::IDataTransfer> GetDataTransfer() const override;
+#if defined(__wasm__)
+  std::unique_ptr<onnxruntime::IExternalDataLoader> GetExternalDataLoader() const override;
+#endif
 
   DataLayout GetPreferredLayout() const override { return preferred_data_layout_; }
 
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
index 60c61b2ca5665..1d779152f91f3 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
@@ -151,6 +151,12 @@ std::shared_ptr<IExecutionProviderFactory> WebGpuProviderFactoryCreator::Create(
       validation_mode,
   };
 
+  LOGS_DEFAULT(VERBOSE) << "WebGPU EP Device ID: " << context_id;
+  LOGS_DEFAULT(VERBOSE) << "WebGPU EP WGPUInstance: " << webgpu_instance;
+  LOGS_DEFAULT(VERBOSE) << "WebGPU EP WGPUDevice: " << webgpu_device;
+  LOGS_DEFAULT(VERBOSE) << "WebGPU EP DawnProcTable: " << dawn_proc_table;
+  LOGS_DEFAULT(VERBOSE) << "WebGPU EP ValidationMode: " << validation_mode;
+
   //
   // STEP.3 - prepare parameters for WebGPU context initialization.
// diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 7adfc6a2b2ccb..1ad35b51bb1c1 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -8,6 +8,14 @@ #include "core/session/onnxruntime_cxx_api.h" #include "api.h" +#ifdef USE_WEBGPU +namespace onnxruntime { +namespace webgpu { +WGPUDevice GetDevice(int); +} +} // namespace onnxruntime +#endif + #include #include #include @@ -164,8 +172,12 @@ OrtSessionOptions* OrtCreateSessionOptions(size_t graph_optimization_level, return UNREGISTER_AUTO_RELEASE(session_options); } -int OrtAppendExecutionProvider(ort_session_options_handle_t session_options, const char* name) { - return CHECK_STATUS(SessionOptionsAppendExecutionProvider, session_options, name, nullptr, nullptr, 0); +int OrtAppendExecutionProvider(ort_session_options_handle_t session_options, + const char* name, + const char* const* provider_options_keys, + const char* const* provider_options_values, + size_t num_keys) { + return CHECK_STATUS(SessionOptionsAppendExecutionProvider, session_options, name, provider_options_keys, provider_options_values, num_keys); } int OrtAddFreeDimensionOverride(ort_session_options_handle_t session_options, @@ -507,6 +519,16 @@ char* OrtEndProfiling(ort_session_handle_t session) { : nullptr; } +// WebGPU API Section + +#ifdef USE_WEBGPU + +WGPUDevice OrtGetWebGpuDevice(int device_id) { + return onnxruntime::webgpu::GetDevice(device_id); +} + +#endif + // Training API Section #ifdef ENABLE_TRAINING_APIS diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index f44c515d98f6b..9ff1eb55ecedc 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -10,6 +10,10 @@ #include +#ifdef USE_WEBGPU +#include +#endif + #include struct OrtSession; @@ -85,7 +89,10 @@ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t * @returns ORT error code. If not zero, call OrtGetLastError() to get detailed error message. */ int EMSCRIPTEN_KEEPALIVE OrtAppendExecutionProvider(ort_session_options_handle_t session_options, - const char* name); + const char* name, + const char* const* provider_options_keys, + const char* const* provider_options_values, + size_t num_keys); /** * add a free dimension override for one dimension of a session's input. @@ -294,6 +301,21 @@ int EMSCRIPTEN_KEEPALIVE OrtRun(ort_session_handle_t session, */ char* EMSCRIPTEN_KEEPALIVE OrtEndProfiling(ort_session_handle_t session); +// WebGPU API Section + +#ifdef USE_WEBGPU + +/** + * get the GPU Device by device ID. + * + * This function is only available after the GPU Device is initialized in WebGpuContextFactory. + * + * @returns a WGPUDevice handle. + */ +WGPUDevice EMSCRIPTEN_KEEPALIVE OrtGetWebGpuDevice(int device_id); + +#endif + // Training API Section #ifdef ENABLE_TRAINING_APIS diff --git a/onnxruntime/wasm/js_post_js.js b/onnxruntime/wasm/js_post_js.js index b77d82fbd7d10..56d3246fd07f0 100644 --- a/onnxruntime/wasm/js_post_js.js +++ b/onnxruntime/wasm/js_post_js.js @@ -2,6 +2,4 @@ // Licensed under the MIT License. -'use strict'; - Module["PTR_SIZE"] = 4; diff --git a/onnxruntime/wasm/js_post_js_64.js b/onnxruntime/wasm/js_post_js_64.js index b140df927ebbd..cfd79523f7900 100644 --- a/onnxruntime/wasm/js_post_js_64.js +++ b/onnxruntime/wasm/js_post_js_64.js @@ -2,6 +2,4 @@ // Licensed under the MIT License. 
-'use strict'; - Module["PTR_SIZE"] = 8; diff --git a/onnxruntime/wasm/post-webgpu.js b/onnxruntime/wasm/post-webgpu.js new file mode 100644 index 0000000000000..146355f6a44d3 --- /dev/null +++ b/onnxruntime/wasm/post-webgpu.js @@ -0,0 +1,261 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// +// This file contains the post-run code for the ORT WebAssembly module. The code in this file will be injected into the +// final module using Emscripten's `--post-js` option. +// +// This file will only be used in build with flag `--use_webgpu`. + +/** + * This function is called only once when initializing the WebGPU backend. + * + * @param {(gpuDevice: GPUDevice) => void} setDefaultDevice A callback function to set the default device. + */ +Module["webgpuInit"] = (setDefaultDevice) => { + /** + * a map from GPUDevice to [deviceId, instanceHandle, deviceHandle] + * + * only stores custom devices (ie. devices created by the user, not the default device created by ORT) + * + * key is the GPUDevice object. + * + * value is a tuple of 3 elements: + * - deviceId: a unique ID for the device. Must be positive integer. + * - instanceHandle: the instance handle(pointer) of the device. + * - deviceHandle: the device handle(pointer) of the device. + * + * @type {WeakMap} + */ + const webgpuActiveDevices = new WeakMap(); + /** + * a number that is used to assign a unique ID to the next custom device. + */ + let webgpuNextDeviceId = 1; + /** + * a function to set the default device. + * + * @type {(gpuDevice: GPUDevice) => void} + */ + const webgpuSetDefaultDevice = setDefaultDevice; + /** + * the current device that is being used to create a WebGPU EP inference session. + * + * the value of this variable is only valid during the creation of a WebGPU EP inference session. + * + * @type {GPUDevice|undefined} + */ + let webgpuCurrentDevice = undefined; + /** + * the current device ID that is being used to create a WebGPU EP inference session. + * + * the value of this variable is only valid during the creation of a WebGPU EP inference session. + * + * @type {number|undefined} + */ + let webgpuCurrentDeviceId = undefined; + + /** + * This function is called only when a custom device is used, during preparation of session options. + * + * @param {GPUDevice} device the user provided device object. + * @returns {undefined|[number, number, number]} a tuple of device id, instance handle, and device handle. + */ + Module["webgpuRegisterDevice"] = (device) => { + if (webgpuCurrentDeviceId !== undefined) { + throw new Error("another WebGPU EP inference session is being created."); + } + + if (device) { + let deviceInfo = webgpuActiveDevices.get(device); + if (!deviceInfo) { + const instanceHandle = _wgpuCreateInstance(0); + const deviceHandle = WebGPU.importJsDevice(device, instanceHandle); + deviceInfo = [webgpuNextDeviceId++, instanceHandle, deviceHandle]; + webgpuActiveDevices.set(device, deviceInfo); + } + + // The current device ID is a temporary storage for the device ID to be used in the session that is being created. + // + // Soon after `webgpuRegisterDevice` (this function) is called, `webgpuOnCreateSession` will be called so that the + // value of `webgpuCurrentDeviceId` is used and reset then. 
+ webgpuCurrentDevice = device; + webgpuCurrentDeviceId = deviceInfo[0]; + return deviceInfo; + } else { + webgpuCurrentDevice = undefined; + webgpuCurrentDeviceId = 0; + return undefined; + } + }; + + const webgpuActiveSessions = new Map(); + Module["webgpuOnCreateSession"] = (sessionHandle) => { + if (webgpuCurrentDeviceId === undefined) { + // do nothing if webgpuCurrentDeviceId is undefined. + // this means no WebGPU EP is being created. + return; + } + + const deviceId = webgpuCurrentDeviceId; + webgpuCurrentDeviceId = undefined; + + if (sessionHandle) { + // when session created successfully + const deviceHandle = _OrtGetWebGpuDevice(deviceId); + webgpuActiveSessions.set(sessionHandle, deviceHandle); + + if (deviceId === 0) { + const device = webgpuCurrentDevice ?? WebGPU.getJsObject(deviceHandle); + webgpuSetDefaultDevice(device); + } + } + webgpuCurrentDevice = undefined; + }; + + Module["webgpuOnReleaseSession"] = (sessionHandle) => { + webgpuActiveSessions.delete(sessionHandle); + }; + + const gpuBufferMetadataSymbol = Symbol("gpuBufferMetadata"); + + Module["webgpuRegisterBuffer"] = (buffer, sessionHandle, bufferHandle) => { + if (bufferHandle) { + // This is a buffer that was created by ORT. Metadata is [bufferHandle, NaN] + + buffer[gpuBufferMetadataSymbol] = [bufferHandle, NaN]; + return bufferHandle; + } else { + // This is a buffer that was created by the user. Metadata is [bufferHandle, refCount] + + const metadata = buffer[gpuBufferMetadataSymbol]; + if (metadata) { + metadata[1]++; + return metadata[0]; + } + + const deviceHandle = webgpuActiveSessions.get(sessionHandle); + if (deviceHandle === undefined) { + throw new Error( + "Invalid session handle passed to webgpuRegisterBuffer" + ); + } + + const bufferHandle = WebGPU.importJsBuffer(buffer, deviceHandle); + buffer[gpuBufferMetadataSymbol] = [bufferHandle, 1]; + return bufferHandle; + } + }; + + Module["webgpuUnregisterBuffer"] = (buffer) => { + const metadata = buffer[gpuBufferMetadataSymbol]; + if (!metadata) { + throw new Error("Buffer is not registered"); + } + metadata[1]--; + // For buffers created by ORT, metadata[1] will always be NaN. This function will not release the buffer. + // Instead, the buffer will be released when user calls `Tensor.dispose()` in JavaScript. + if (metadata[1] === 0) { + _wgpuBufferRelease(metadata[0]); + delete buffer[gpuBufferMetadataSymbol]; + } + }; + + Module["webgpuGetBuffer"] = (bufferHandle) => { + return WebGPU.getJsObject(bufferHandle); + }; + + Module["webgpuCreateDownloader"] = (gpuBuffer, bufferSize, sessionHandle) => { + const deviceHandle = webgpuActiveSessions.get(sessionHandle); + if (deviceHandle === undefined) { + throw new Error("Invalid session handle passed to webgpuRegisterBuffer"); + } + + const buffer = gpuBuffer; + const device = WebGPU.getJsObject(deviceHandle); + const originalSize = bufferSize; + const size = Math.ceil(Number(originalSize) / 16) * 16; + + return async () => { + // prettier-ignore + // + // the line above is used to force prettier to skip formatting the next statement. + // this is because prettier will remove the quotes around the property names, but we need to keep them + // because otherwise closure compiler may rename them and break the code. 
+ const gpuReadBufferDescriptor = { + "size": size, + "usage": 9 /* GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ */, + }; + const gpuReadBuffer = device.createBuffer(gpuReadBufferDescriptor); + try { + const commandEncoder = device.createCommandEncoder(); + commandEncoder.copyBufferToBuffer( + buffer /* source buffer */, + 0 /* source offset */, + gpuReadBuffer /* destination buffer */, + 0 /* destination offset */, + size /* size */ + ); + device.queue.submit([commandEncoder.finish()]); + + await gpuReadBuffer.mapAsync(GPUMapMode.READ); + + const arrayBuffer = gpuReadBuffer.getMappedRange(); + return arrayBuffer.slice(0, originalSize); + } finally { + gpuReadBuffer.destroy(); + } + }; + }; + + // Setup a callback function for loading external buffers (model weights). + Module.webgpuUploadExternalBuffer = (bufferHandle, data) => { + const srcArrayBuffer = data.buffer; + const srcOffset = data.byteOffset; + const srcLength = data.byteLength; + const size = Math.ceil(Number(srcLength) / 16) * 16; + + const gpuBuffer = WebGPU.getJsObject(bufferHandle); + + // get current device + if (!webgpuCurrentDevice) { + const deviceHandle = _OrtGetWebGpuDevice(webgpuCurrentDeviceId); + webgpuCurrentDevice = WebGPU.getJsObject(deviceHandle); + } + + // create gpu buffer + + // prettier-ignore + // + // the line above is used to force prettier to skip formatting the next statement. + // this is because prettier will remove the quotes around the property names, but we need to keep them + // because otherwise closure compiler may rename them and break the code. + const gpuBufferForUploadingDescriptor = { + "mappedAtCreation": true, + "size": size, + "usage": 6 /* GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC */, + }; + const gpuBufferForUploading = webgpuCurrentDevice.createBuffer( + gpuBufferForUploadingDescriptor + ); + + // copy (upload) data + const arrayBuffer = gpuBufferForUploading.getMappedRange(); + new Uint8Array(arrayBuffer).set( + new Uint8Array(srcArrayBuffer, srcOffset, srcLength) + ); + gpuBufferForUploading.unmap(); + + // GPU copy + const commandEncoder = webgpuCurrentDevice.createCommandEncoder(); + commandEncoder.copyBufferToBuffer( + gpuBufferForUploading, + 0, + gpuBuffer, + 0, + size + ); + webgpuCurrentDevice.queue.submit([commandEncoder.finish()]); + gpuBufferForUploading.destroy(); + }; +}; diff --git a/onnxruntime/wasm/pre-async.js b/onnxruntime/wasm/pre-async.js new file mode 100644 index 0000000000000..8c75dc7c5cf1e --- /dev/null +++ b/onnxruntime/wasm/pre-async.js @@ -0,0 +1,132 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// +// This file contains the pre-run code for the ORT WebAssembly module. The code in this file will be injected into the +// final module using Emscripten's `--pre-js` option. +// +// This file will only be used in build with flag `-s ASYNCIFY=1`. + +/** + * initialize for asyncify support. + */ +let initAsyncImpl = () => { + // This is a simplified version of cwrap() with options.async === true (-sASYNCIFY=1) + // It removes some overhead in cwarp() and ccall() that we don't need. + // + // Currently in ASYNCIFY build, we only use this for the following functions: + // - OrtCreateSession() + // - OrtRun() + // - OrtRunWithBinding() + // - OrtBindInput() + // + // Note: about parameters "getFunc" and "setFunc": + // - Emscripten has different behaviors for Debug and Release builds for generating exported function wrapper. 
+ // + // - In Debug build, it will generate a wrapper function for each exported function. For example, it generates a + // wrapper for OrtRun() like this (minified): + // ``` + // var _OrtRun = Module["_OrtRun"] = createExportWrapper("OrtRun"); + // ``` + // + // - In Release build, it will generate a lazy loading wrapper for each exported function. For example, it generates + // a wrapper for OrtRun() like this (minified): + // ``` + // d._OrtRun = (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); + // ``` + // + // The behavior of these two wrappers are different. The debug build will assign `Module["_OrtRun"]` only once + // because `createExportWrapper()` does not reset `Module["_OrtRun"]` inside. The release build, however, will + // reset d._OrtRun to J.ka when the first time it is called. + // + // The difference is important because we need to design the async wrapper in a way that it can handle both cases. + // + // Now, let's look at how the async wrapper is designed to work for both cases: + // + // - Debug build: + // 1. When Web assembly is being loaded, `Module["_OrtRun"]` is assigned to `createExportWrapper("OrtRun")`. + // 2. When the first time `Module["initAsync"]` is called, `Module["_OrtRun"]` is re-assigned to a new async + // wrapper function. + // Value of `Module["_OrtRun"]` will not be changed again. + // + // - Release build: + // 1. When Web assembly is being loaded, `Module["_OrtRun"]` is assigned to a lazy loading wrapper function. + // 2. When the first time `Module["initAsync"]` is called, `Module["_OrtRun"]` is re-assigned to a new async + // wrapper function. + // 3. When the first time `Module["_OrtRun"]` is called, the async wrapper will be called. It will call into this + // function: + // ``` + // (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); + // ``` + // This function will assign d._OrtRun (ie. the minimized `Module["_OrtRun"]`) to the real function (J.ka). + // 4. Since d._OrtRun is re-assigned, we need to update the async wrapper to re-assign its stored + // function to the updated value (J.ka), and re-assign the value of `d._OrtRun` back to the async wrapper. + // Value of `Module["_OrtRun"]` will not be changed again. + // + // The value of `Module["_OrtRun"]` will need to be assigned for 2 times for debug build and 4 times for release + // build. + // + // This is why we need this `getFunc` and `setFunc` parameters. They are used to get the current value of an + // exported function and set the new value of an exported function. + // + const wrapAsync = (func, getFunc, setFunc) => { + return (...args) => { + // cache the async data before calling the function. + const previousAsync = Asyncify.currData; + + const previousFunc = getFunc?.(); + const ret = func(...args); + const newFunc = getFunc?.(); + if (previousFunc !== newFunc) { + // The exported function has been updated. + // Set the sync function reference to the new function. + func = newFunc; + // Set the exported function back to the async wrapper. + setFunc(previousFunc); + // Remove getFunc and setFunc. They are no longer needed. + setFunc = null; + getFunc = null; + } + + // If the async data has been changed, it means that the function started an async operation. + if (Asyncify.currData != previousAsync) { + // returns the promise + return Asyncify.whenDone(); + } + // the function is synchronous. returns the result. 
+ return ret; + }; + }; + + // replace the original functions with asyncified versions + const wrapAsyncAPIs = (funcNames) => { + for (const funcName of funcNames) { + Module[funcName] = wrapAsync( + Module[funcName], + () => Module[funcName], + (v) => (Module[funcName] = v) + ); + } + }; + + wrapAsyncAPIs([ + "_OrtAppendExecutionProvider", + "_OrtCreateSession", + "_OrtRun", + "_OrtRunWithBinding", + "_OrtBindInput", + ]); + + // If JSEP is enabled, wrap OrtRun() and OrtRunWithBinding() with asyncify. + if (typeof jsepRunAsync !== "undefined") { + Module["_OrtRun"] = jsepRunAsync(Module["_OrtRun"]); + Module["_OrtRunWithBinding"] = jsepRunAsync(Module["_OrtRunWithBinding"]); + } + + // remove this function to make sure it is called only once. + initAsyncImpl = undefined; +}; + +Module["asyncInit"] = () => { + initAsyncImpl?.(); +}; diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index 0c83e71a921cb..5b2f044d4c27b 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -1,255 +1,157 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -'use strict'; - // // This file contains the pre-run code for the ORT WebAssembly module. The code in this file will be injected into the // final module using Emscripten's `--pre-js` option. // // This file will only be used in build with flag `--use_jsep`. - -/** - * initialize JSEP for asyncify support. - */ -let jsepInitAsync = () => { - // This is a simplified version of cwrap() with options.async === true (-sASYNCIFY=1) - // It removes some overhead in cwarp() and ccall() that we don't need. - // - // Currently in JSEP build, we only use this for the following functions: - // - OrtRun() - // - OrtRunWithBinding() - // - OrtBindInput() - // - // Note: about parameters "getFunc" and "setFunc": - // - Emscripten has different behaviors for Debug and Release builds for generating exported function wrapper. - // - // - In Debug build, it will generate a wrapper function for each exported function. For example, it generates a - // wrapper for OrtRun() like this (minified): - // ``` - // var _OrtRun = Module["_OrtRun"] = createExportWrapper("OrtRun"); - // ``` - // - // - In Release build, it will generate a lazy loading wrapper for each exported function. For example, it generates - // a wrapper for OrtRun() like this (minified): - // ``` - // d._OrtRun = (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); - // ``` - // - // The behavior of these two wrappers are different. The debug build will assign `Module["_OrtRun"]` only once - // because `createExportWrapper()` does not reset `Module["_OrtRun"]` inside. The release build, however, will - // reset d._OrtRun to J.ka when the first time it is called. - // - // The difference is important because we need to design the async wrapper in a way that it can handle both cases. - // - // Now, let's look at how the async wrapper is designed to work for both cases: - // - // - Debug build: - // 1. When Web assembly is being loaded, `Module["_OrtRun"]` is assigned to `createExportWrapper("OrtRun")`. - // 2. When the first time `Module["jsepInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async - // wrapper function. - // Value of `Module["_OrtRun"]` will not be changed again. - // - // - Release build: - // 1. When Web assembly is being loaded, `Module["_OrtRun"]` is assigned to a lazy loading wrapper function. - // 2. 
When the first time `Module["jsepInit"]` is called, `Module["_OrtRun"]` is re-assigned to a new async - // wrapper function. - // 3. When the first time `Module["_OrtRun"]` is called, the async wrapper will be called. It will call into this - // function: - // ``` - // (a, b, c, e, f, h, l, q) => (d._OrtRun = J.ka)(a, b, c, e, f, h, l, q); - // ``` - // This function will assign d._OrtRun (ie. the minimized `Module["_OrtRun"]`) to the real function (J.ka). - // 4. Since d._OrtRun is re-assigned, we need to update the async wrapper to re-assign its stored - // function to the updated value (J.ka), and re-assign the value of `d._OrtRun` back to the async wrapper. - // Value of `Module["_OrtRun"]` will not be changed again. - // - // The value of `Module["_OrtRun"]` will need to be assigned for 2 times for debug build and 4 times for release - // build. - // - // This is why we need this `getFunc` and `setFunc` parameters. They are used to get the current value of an - // exported function and set the new value of an exported function. - // - const jsepWrapAsync = (func, getFunc, setFunc) => { - return (...args) => { - // cache the async data before calling the function. - const previousAsync = Asyncify.currData; - - const previousFunc = getFunc?.(); - const ret = func(...args); - const newFunc = getFunc?.(); - if (previousFunc !== newFunc) { - // The exported function has been updated. - // Set the sync function reference to the new function. - func = newFunc; - // Set the exported function back to the async wrapper. - setFunc(previousFunc); - // Remove getFunc and setFunc. They are no longer needed. - setFunc = null; - getFunc = null; +// This is a wrapper for OrtRun() and OrtRunWithBinding() to ensure that Promises are handled correctly. +const jsepRunAsync = (runAsyncFunc) => { + return async (...args) => { + try { + // Module.jsepSessionState should be null, unless we are in the middle of a session. + // If it is not null, it means that the previous session has not finished yet. + if (Module.jsepSessionState) { + throw new Error("Session already started"); } + const state = (Module.jsepSessionState = { + sessionHandle: args[0], + errors: [], + }); - // If the async data has been changed, it means that the function started an async operation. - if (Asyncify.currData != previousAsync) { - // returns the promise - return Asyncify.whenDone(); - } - // the function is synchronous. returns the result. - return ret; - }; - }; - - // This is a wrapper for OrtRun() and OrtRunWithBinding() to ensure that Promises are handled correctly. - const runAsync = (runAsyncFunc) => { - return async (...args) => { - try { - // Module.jsepSessionState should be null, unless we are in the middle of a session. - // If it is not null, it means that the previous session has not finished yet. - if (Module.jsepSessionState) { - throw new Error('Session already started'); - } - const state = Module.jsepSessionState = {sessionHandle: args[0], errors: []}; - - // Run the acyncified function: OrtRun() or OrtRunWithBinding() - const ret = await runAsyncFunc(...args); + // Run the acyncified function: OrtRun() or OrtRunWithBinding() + const ret = await runAsyncFunc(...args); - // Check if the session is still valid. this object should be the same as the one we set above. - if (Module.jsepSessionState !== state) { - throw new Error('Session mismatch'); - } + // Check if the session is still valid. this object should be the same as the one we set above. 
+ if (Module.jsepSessionState !== state) { + throw new Error("Session mismatch"); + } - // Flush the backend. This will submit all pending commands to the GPU. - Module.jsepBackend?.['flush'](); + // Flush the backend. This will submit all pending commands to the GPU. + Module.jsepBackend?.["flush"](); - // Await all pending promises. This includes GPU validation promises for diagnostic purposes. - const errorPromises = state.errors; - if (errorPromises.length > 0) { - let errors = await Promise.all(errorPromises); - errors = errors.filter(e => e); - if (errors.length > 0) { - throw new Error(errors.join('\n')); - } + // Await all pending promises. This includes GPU validation promises for diagnostic purposes. + const errorPromises = state.errors; + if (errorPromises.length > 0) { + let errors = await Promise.all(errorPromises); + errors = errors.filter((e) => e); + if (errors.length > 0) { + throw new Error(errors.join("\n")); } - - return ret; - } finally { - Module.jsepSessionState = null; } - }; - }; - // replace the original functions with asyncified versions - Module['_OrtCreateSession'] = jsepWrapAsync( - Module['_OrtCreateSession'], - () => Module['_OrtCreateSession'], - v => Module['_OrtCreateSession'] = v); - Module['_OrtRun'] = runAsync(jsepWrapAsync( - Module['_OrtRun'], - () => Module['_OrtRun'], - v => Module['_OrtRun'] = v)); - Module['_OrtRunWithBinding'] = runAsync(jsepWrapAsync( - Module['_OrtRunWithBinding'], - () => Module['_OrtRunWithBinding'], - v => Module['_OrtRunWithBinding'] = v)); - Module['_OrtBindInput'] = jsepWrapAsync( - Module['_OrtBindInput'], - () => Module['_OrtBindInput'], - v => Module['_OrtBindInput'] = v); - - // remove this function to make sure it is called only once. - jsepInitAsync = undefined; + return ret; + } finally { + Module.jsepSessionState = null; + } + }; }; - /** - * initialize JSEP for WebGPU. + * initialize JSEP for WebGPU and WebNN. 
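+ *
+ * @param {'webgpu'|'webnn'} name - which backend the call initializes; the shape of
+ *   `params` mirrors the destructuring in the corresponding branch below.
+ * @param {unknown[]} params - the backend object followed by its backend-specific callbacks.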
*/ -Module['jsepInit'] = (name, params) => { - jsepInitAsync?.(); - - if (name === 'webgpu') { - [Module.jsepBackend, - Module.jsepAlloc, - Module.jsepFree, - Module.jsepCopy, - Module.jsepCopyAsync, - Module.jsepCreateKernel, - Module.jsepReleaseKernel, - Module.jsepRunKernel, - Module.jsepCaptureBegin, - Module.jsepCaptureEnd, - Module.jsepReplay] = params; +Module["jsepInit"] = (name, params) => { + if (name === "webgpu") { + [ + Module.jsepBackend, + Module.jsepAlloc, + Module.jsepFree, + Module.jsepCopy, + Module.jsepCopyAsync, + Module.jsepCreateKernel, + Module.jsepReleaseKernel, + Module.jsepRunKernel, + Module.jsepCaptureBegin, + Module.jsepCaptureEnd, + Module.jsepReplay, + ] = params; // expose webgpu backend functions const backend = Module.jsepBackend; - Module['jsepRegisterBuffer'] = (sessionId, index, buffer, size) => { - return backend['registerBuffer'](sessionId, index, buffer, size); + Module["jsepRegisterBuffer"] = (sessionId, index, buffer, size) => { + return backend["registerBuffer"](sessionId, index, buffer, size); }; - Module['jsepGetBuffer'] = (dataId) => { - return backend['getBuffer'](dataId); + Module["jsepGetBuffer"] = (dataId) => { + return backend["getBuffer"](dataId); }; - Module['jsepCreateDownloader'] = (gpuBuffer, size, type) => { - return backend['createDownloader'](gpuBuffer, size, type); + Module["jsepCreateDownloader"] = (gpuBuffer, size, type) => { + return backend["createDownloader"](gpuBuffer, size, type); }; - Module['jsepOnCreateSession'] = sessionId => { - backend['onCreateSession'](sessionId); + Module["jsepOnCreateSession"] = (sessionId) => { + backend["onCreateSession"](sessionId); }; - Module['jsepOnReleaseSession'] = sessionId => { - backend['onReleaseSession'](sessionId); + Module["jsepOnReleaseSession"] = (sessionId) => { + backend["onReleaseSession"](sessionId); }; - Module['jsepOnRunStart'] = sessionId => { - return backend['onRunStart'](sessionId); + Module["jsepOnRunStart"] = (sessionId) => { + return backend["onRunStart"](sessionId); }; Module.jsepUploadExternalBuffer = (dataId, buffer) => { - backend['upload'](dataId, buffer); + backend["upload"](dataId, buffer); }; - } else if (name === 'webnn') { + } else if (name === "webnn") { // Functions called from EM_ASM need to be assigned in a way that can be minified. // Functions called via emscripten::val::module_property need to be assigned by name so that the minifier doesn't // change the name. - [Module.jsepBackend, - Module.jsepReserveTensorId, - Module.jsepReleaseTensorId, - Module['jsepEnsureTensor'], - Module.jsepUploadTensor, - Module['jsepDownloadTensor'], + [ + Module.jsepBackend, + Module.jsepReserveTensorId, + Module.jsepReleaseTensorId, + Module["jsepEnsureTensor"], + Module.jsepUploadTensor, + Module["jsepDownloadTensor"], ] = params; // This function is called from both JS and an EM_ASM block, it needs both a minifiable name and an explicit name. - Module['jsepReleaseTensorId'] = Module.jsepReleaseTensorId; - Module['jsepUploadTensor'] = Module.jsepUploadTensor; + Module["jsepReleaseTensorId"] = Module.jsepReleaseTensorId; + Module["jsepUploadTensor"] = Module.jsepUploadTensor; // Functions called from JS also need to have explicit names. 
const backend = Module.jsepBackend; - Module['jsepOnRunStart'] = sessionId => { - return backend['onRunStart'](sessionId); + Module["jsepOnRunStart"] = (sessionId) => { + return backend["onRunStart"](sessionId); }; - Module['jsepOnRunEnd'] = backend['onRunEnd'].bind(backend); - Module['jsepRegisterMLContext'] = (sessionId, mlContext) => { - backend['registerMLContext'](sessionId, mlContext); + Module["jsepOnRunEnd"] = backend["onRunEnd"].bind(backend); + Module["jsepRegisterMLContext"] = (sessionId, mlContext) => { + backend["registerMLContext"](sessionId, mlContext); }; - Module['jsepOnReleaseSession'] = sessionId => { - backend['onReleaseSession'](sessionId); + Module["jsepOnReleaseSession"] = (sessionId) => { + backend["onReleaseSession"](sessionId); }; - Module['jsepCreateMLTensorDownloader'] = (tensorId, type) => { - return backend['createMLTensorDownloader'](tensorId, type); - } - Module['jsepRegisterMLTensor'] = (sessionId, tensor, dataType, shape) => { - return backend['registerMLTensor'](sessionId, tensor, dataType, shape); + Module["jsepCreateMLTensorDownloader"] = (tensorId, type) => { + return backend["createMLTensorDownloader"](tensorId, type); + }; + Module["jsepRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => { + return backend["registerMLTensor"](sessionId, tensor, dataType, shape); }; - Module['jsepCreateMLContext'] = (optionsOrGpuDevice) => { - return backend['createMLContext'](optionsOrGpuDevice); + Module["jsepCreateMLContext"] = (optionsOrGpuDevice) => { + return backend["createMLContext"](optionsOrGpuDevice); }; - Module['jsepRegisterMLConstant'] = (externalFilePath, dataOffset, dataLength, builder, desc) => { - return backend['registerMLConstant']( - externalFilePath, dataOffset, dataLength, builder, desc, Module.MountedFiles); + Module["jsepRegisterMLConstant"] = ( + externalFilePath, + dataOffset, + dataLength, + builder, + desc + ) => { + return backend["registerMLConstant"]( + externalFilePath, + dataOffset, + dataLength, + builder, + desc, + Module.MountedFiles + ); }; - Module['jsepRegisterGraphInput'] = backend['registerGraphInput'].bind(backend); - Module['jsepIsGraphInput'] = backend['isGraphInput'].bind(backend); + Module["jsepRegisterGraphInput"] = + backend["registerGraphInput"].bind(backend); + Module["jsepIsGraphInput"] = backend["isGraphInput"].bind(backend); - Module['jsepCreateTemporaryTensor'] = backend['createTemporaryTensor'].bind(backend); + Module["jsepCreateTemporaryTensor"] = + backend["createTemporaryTensor"].bind(backend); } }; diff --git a/onnxruntime/wasm/pre.js b/onnxruntime/wasm/pre.js index 9b5f3ce545b78..636a9713519a7 100644 --- a/onnxruntime/wasm/pre.js +++ b/onnxruntime/wasm/pre.js @@ -1,21 +1,18 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -'use strict'; - // // This file contains the pre-run code for the ORT WebAssembly module. The code in this file will be injected into the // final module using Emscripten's `--pre-js` option. - /** * Mount external data files of a model to an internal map, which will be used during session initialization. 
 *
 * @param {string} externalDataFilePath
 * @param {Uint8Array} externalDataFileData
 */
-Module['mountExternalData'] = (externalDataFilePath, externalDataFileData) => {
-  if (externalDataFilePath.startsWith('./')) {
+Module["mountExternalData"] = (externalDataFilePath, externalDataFileData) => {
+  if (externalDataFilePath.startsWith("./")) {
     externalDataFilePath = externalDataFilePath.substring(2);
   }
   const files = Module.MountedFiles || (Module.MountedFiles = new Map());
@@ -25,7 +22,7 @@ Module['mountExternalData'] = (externalDataFilePath, externalDataFileData) => {
 /**
  * Unmount external data files of a model.
  */
-Module['unmountExternalData'] = () => {
+Module["unmountExternalData"] = () => {
   delete Module.MountedFiles;
 };

@@ -48,5 +45,7 @@ Module['unmountExternalData'] = () => {
  *
  * @suppress {checkVars}
  */
-var SharedArrayBuffer = globalThis.SharedArrayBuffer ??
-    new WebAssembly.Memory({'initial': 0, 'maximum': 0, 'shared': true}).buffer.constructor;
+var SharedArrayBuffer =
+  globalThis.SharedArrayBuffer ??
+  new WebAssembly.Memory({ initial: 0, maximum: 0, shared: true }).buffer
+    .constructor;
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 8607887072347..fe20351b0e8bb 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1592,8 +1592,11 @@ def generate_build_tree(
             raise BuildError("WebNN is only available for WebAssembly build.")
         cmake_args += ["-Donnxruntime_USE_WEBNN=ON"]

-    if args.use_jsep and args.use_webgpu:
-        raise BuildError("JSEP (--use_jsep) and WebGPU (--use_webgpu) cannot be enabled at the same time.")
+    # TODO: currently we allow building with both --use_jsep and --use_webgpu in this working branch.
+    # This situation is temporary. Eventually, those two flags will be mutually exclusive.
+    #
+    # if args.use_jsep and args.use_webgpu:
+    #     raise BuildError("JSEP (--use_jsep) and WebGPU (--use_webgpu) cannot be enabled at the same time.")

     if args.use_external_dawn and not args.use_webgpu:
         raise BuildError("External Dawn (--use_external_dawn) must be enabled with WebGPU (--use_webgpu).")
From 8f07743597f454b00317939cdb33c1e4e9d6e2a6 Mon Sep 17 00:00:00 2001
From: Yi-Hong Lyu
Date: Thu, 6 Mar 2025 16:01:08 -0800
Subject: [PATCH 28/46] Adding OpenVINO Windows CI Pipeline (#23919)

### Description
Enable an OpenVINO Windows CI pipeline. This includes:
- Downloading the OpenVINO toolkit for Windows from an external source.
- Setting up OpenVINO environment variables.
- Building the ONNX Runtime OpenVINO Execution Provider.
- Running unit tests.

### Motivation and Context
This change is required to run checks on pre-commit and commit in the ONNX
Runtime project. It ensures that the code is tested with the OpenVINO toolkit
on Windows, improving the reliability and compatibility of the project.
---
 cmake/onnxruntime_unittests.cmake             |   4 +-
 .../templates/jobs/download_win_openvino.yml  |  64 ++++++++++
 .../win-openvino-ci-pipeline.yml              | 116 ++++++++++++++++++
 tools/ci_build/set-trigger-rules.py           |   3 +-
 4 files changed, 183 insertions(+), 4 deletions(-)
 create mode 100644 tools/ci_build/github/azure-pipelines/templates/jobs/download_win_openvino.yml
 create mode 100644 tools/ci_build/github/azure-pipelines/win-openvino-ci-pipeline.yml

diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 2ed7923941643..87aee2a174fab 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -236,14 +236,14 @@ function(AddTest)
       )
     endif()
     # Set test timeout to 3 hours.
- set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 7200) + set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 10800) else() add_test(NAME ${_UT_TARGET} COMMAND ${_UT_TARGET} ${TEST_ARGS} WORKING_DIRECTORY $ ) # Set test timeout to 3 hours. - set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 7200) + set_tests_properties(${_UT_TARGET} PROPERTIES TIMEOUT 10800) endif() endif() endfunction(AddTest) diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_openvino.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_openvino.yml new file mode 100644 index 0000000000000..f6956b426ddfc --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_openvino.yml @@ -0,0 +1,64 @@ +parameters: + - name: OpenVINOVersion + type: string + default: '2025.0.0' + +steps: + - powershell: | + $Url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/2025.0/windows/openvino_toolkit_windows_2025.0.0.17942.1f68be9f594_x86_64.zip" + $OutputPath = "$env:Agent_TempDirectory\openvino.zip" + $ExtractPath = "$env:Agent_TempDirectory\openvino-v$env:OpenVINOVersion" + $TempExtractPath = "$env:Agent_TempDirectory\openvino_temp" + + # Ensure directories exist + if (Test-Path $ExtractPath) { + Remove-Item -Recurse -Force $ExtractPath + } + New-Item -ItemType Directory -Path $ExtractPath | Out-Null + New-Item -ItemType Directory -Path $TempExtractPath | Out-Null + + # Download OpenVINO ZIP + Write-Output "Downloading OpenVINO" + Invoke-WebRequest -Uri $Url -OutFile $OutputPath + + # Extract to temporary directory first + Write-Output "Extracting OpenVINO to a temporary directory" + Expand-Archive -Path $OutputPath -DestinationPath $TempExtractPath -Force + + # Locate the nested subdirectory + $InnerFolder = Get-ChildItem -Path $TempExtractPath -Directory | Select-Object -First 1 + + if ($InnerFolder) { + Write-Output "Moving extracted files to final destination" + Move-Item -Path "$($InnerFolder.FullName)\*" -Destination $ExtractPath -Force + } else { + Write-Error "Extraction failed: No expected subdirectory found in $TempExtractPath." + Write-Error "The archive may not have extracted correctly, or its structure is different than expected." 
+ exit 1 + } + + # Clean up temporary files + Remove-Item -Recurse -Force $TempExtractPath + Remove-Item -Force $OutputPath + + # Confirm success + Write-Output "OpenVINO extracted to $ExtractPath" + displayName: 'Download OpenVINO Toolkit v${{ parameters.OpenVINOVersion }}' + env: + OpenVINOVersion: ${{ parameters.OpenVINOVersion }} + + - powershell: | + echo "##vso[task.setvariable variable=OpenVINORootDir]$(Agent.TempDirectory)\openvino-v${{ parameters.OpenVINOVersion }}" + displayName: 'Set OpenVINORootDir' + + - task: CmdLine@2 + inputs: + script: | + echo $(OpenVINORootDir) + displayName: 'Print OpenVINORootDir after downloading OpenVINO' + + - task: CmdLine@2 + displayName: 'Print contents of OpenVINO Toolkit' + inputs: + script: | + dir $(OpenVINORootDir) diff --git a/tools/ci_build/github/azure-pipelines/win-openvino-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-openvino-ci-pipeline.yml new file mode 100644 index 0000000000000..f95ac526886fa --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-openvino-ci-pipeline.yml @@ -0,0 +1,116 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +jobs: +- job: 'BUILD_OPENVINO_EP' + pool: 'onnxruntime-Win-CPU-2022' + variables: + MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary' + OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)' + DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true + buildArch: x64 + setVcvars: true + BuildConfig: 'RelWithDebInfo' + ALLOW_RELEASED_ONNX_OPSET_ONLY: '1' + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + timeoutInMinutes: 240 + workspace: + clean: all + steps: + + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.12' + addToPath: true + architecture: $(buildArch) + + - template: templates/jobs/download_win_openvino.yml + + - powershell: | + Write-Output "Setting up OpenVINO environment variables" + . "$(OpenVINORootDir)\setupvars.ps1" + + Write-Output "Exporting selected environment variables to pipeline" + + $vars = @( + "INTEL_OPENVINO_DIR", + "OpenVINO_DIR", + "OpenVINOGenAI_DIR", + "OPENVINO_LIB_PATHS", + "TBB_DIR", + "PATH", + "PYTHONPATH" + ) + + foreach ($var in $vars) { + if (Test-Path "Env:$var") { + $value = [System.Environment]::GetEnvironmentVariable($var, "Process") + Write-Output "Setting $var" + Write-Output "##vso[task.setvariable variable=$var;]$value" + } else { + Write-Output "Warning: $var is not set." 
+ } + } + + Write-Output "Selected environment variables exported successfully" + displayName: 'Set up OpenVINO environment' + + - template: templates/jobs/win-ci-build-steps.yml + parameters: + WithCache: True + Today: $(TODAY) + AdditionalKey: "win-openvino | $(BuildConfig)" + BuildPyArguments: >- + --config $(BuildConfig) + --build_dir $(Build.BinariesDirectory) + --cmake_generator "Visual Studio 17 2022" + --build_shared_lib + --use_openvino CPU + --use_binskim_compliant_compile_flags + --update --parallel + MsbuildArguments: $(MsbuildArguments) + BuildArch: $(buildArch) + Platform: 'x64' + BuildConfig: $(BuildConfig) + + - powershell: | + Write-Output "Getting CPU information" + Get-WmiObject Win32_Processor | Select-Object Name, NumberOfCores, NumberOfLogicalProcessors, Architecture | Format-Table -AutoSize + + Write-Output "Starting unit tests" + python "$(Build.SourcesDirectory)\tools\ci_build\build.py" ` + --config "$(BuildConfig)" ` + --build_dir "$(Build.BinariesDirectory)" ` + --cmake_generator "Visual Studio 17 2022" ` + --build_shared_lib ` + --use_openvino CPU ` + --use_binskim_compliant_compile_flags ` + --test --enable_onnx_tests + displayName: 'Run unit tests' diff --git a/tools/ci_build/set-trigger-rules.py b/tools/ci_build/set-trigger-rules.py index 78f59452d1284..899aaaa95216a 100644 --- a/tools/ci_build/set-trigger-rules.py +++ b/tools/ci_build/set-trigger-rules.py @@ -16,8 +16,6 @@ "android-x86_64-crosscompile-ci-pipeline.yml", "bigmodels-ci-pipeline.yml", "linux-ci-pipeline.yml", - "linux-cpu-aten-pipeline.yml", - "linux-cpu-eager-pipeline.yml", "linux-dnnl-ci-pipeline.yml", "linux-gpu-ci-pipeline.yml", "linux-gpu-tensorrt-ci-pipeline.yml", @@ -36,6 +34,7 @@ "win-gpu-doc-gen-ci-pipeline.yml", "win-gpu-tensorrt-ci-pipeline.yml", "win-gpu-webgpu-ci-pipeline.yml", + "win-openvino-ci-pipeline.yml", "win-qnn-arm64-ci-pipeline.yml", "win-qnn-ci-pipeline.yml", ] From 4bb79d13a06c6a02d522b736f030070bdfe752cb Mon Sep 17 00:00:00 2001 From: vraspar Date: Thu, 6 Mar 2025 16:46:56 -0800 Subject: [PATCH 29/46] [WebGPU EP] SoftMax Implementation (#23538) Increase coverage for WebGPU Op --- .../core/providers/webgpu/math/softmax.cc | 238 ++++++++++++++++++ .../core/providers/webgpu/math/softmax.h | 54 ++++ .../core/providers/webgpu/tensor/transpose.cc | 60 +++-- .../core/providers/webgpu/tensor/transpose.h | 2 + .../webgpu/webgpu_execution_provider.cc | 6 +- .../test/providers/cpu/math/softmax_test.cc | 13 +- 6 files changed, 344 insertions(+), 29 deletions(-) create mode 100644 onnxruntime/core/providers/webgpu/math/softmax.cc create mode 100644 onnxruntime/core/providers/webgpu/math/softmax.h diff --git a/onnxruntime/core/providers/webgpu/math/softmax.cc b/onnxruntime/core/providers/webgpu/math/softmax.cc new file mode 100644 index 0000000000000..d06fc5a57eb8c --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/softmax.cc @@ -0,0 +1,238 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
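+
+// Overview of the generated shader (see GenerateShaderCode below): each
+// workgroup processes one row of the input viewed as [rows, cols]. The row is
+// handled in three passes -- a shared-memory max reduction (for numerical
+// stability), a reduction computing sum(exp(x - max)), and a final pass that
+// writes exp(x - max) / sum. If the softmax axis is not the last dimension,
+// the input is transposed first and the result is transposed back afterwards.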
+
+#include <numeric>
+
+#include "core/common/inlined_containers.h"
+#include "core/providers/common.h"
+#include "core/providers/webgpu/math/softmax.h"
+#include "core/providers/webgpu/tensor/transpose.h"
+#include "core/providers/cpu/tensor/utils.h"
+#include "core/providers/webgpu/shader_variable.h"
+#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+namespace onnxruntime {
+namespace webgpu {
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    Softmax,
+    kOnnxDomain,
+    1, 10,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", WebGpuSupportedNumberTypes()),
+    Softmax);
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    Softmax,
+    kOnnxDomain,
+    11, 12,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", WebGpuSupportedNumberTypes()),
+    Softmax);
+
+ONNX_OPERATOR_KERNEL_EX(
+    Softmax,
+    kOnnxDomain,
+    13,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", WebGpuSupportedNumberTypes()),
+    Softmax);
+
+static std::string MaxVector(const std::string& name, int components) {
+  switch (components) {
+    case 1:
+      return name;
+    case 2:
+      return "max(" + name + ".x, " + name + ".y)";
+    case 3:
+      return "max(max(" + name + ".x, " + name + ".y), " + name + ".z)";
+    case 4:
+      return "max(max(" + name + ".x, " + name + ".y), max(" + name + ".z, " + name + ".w))";
+    default:
+      ORT_THROW("Unsupported number of components: ", components);
+  }
+}
+
+static std::string SumVector(const std::string& x, int components) {
+  switch (components) {
+    case 1:
+      return x;
+    case 2:
+      return "(" + x + ".x + " + x + ".y" + ")";
+    case 4:
+      return "(" + x + ".x + " + x + ".y + " + x + ".w + " + x + ".z" + ")";
+    default:
+      ORT_THROW("Unsupported number of components: ", components);
+  }
+}
+
+static int GetMaxComponents(int64_t size) {
+  if (size % 4 == 0) {
+    return 4;
+  } else if (size % 2 == 0) {
+    return 2;
+  }
+  return 1;
+}
+
+Status SoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  // Add input and output variables
+  const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
+  shader.AddOutput("result", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias);
+  int components = input.NumComponents();
+
+  const std::string thread_max_decl = is_fp32_
"var thread_max = x_value_t(-3.402823e+38f);\n" + : "var thread_max = x_value_t(-65504.0h);\n"; + + // Define shared memory for row max and row sum + shader.AdditionalImplementation() + << "var row_max_shared : x_value_t;\n" + << "var row_sum_shared : x_value_t;\n" + << "var thread_shared : array;\n"; + + // Define helper functions to get and set values + shader.AdditionalImplementation() + << "fn getValue(row: i32, col: i32, row_stride: i32) -> x_value_t {\n" + << " let index = row * row_stride + col;\n" + << " return x[index];\n" + << "}\n" + << "fn setValue(row: i32, col: i32, row_stride: i32, value: x_value_t) {\n" + << " let index = row * row_stride + col;\n" + << " result[index] = value;\n" + << "}\n"; + + // Main function body + shader.MainFunctionBody() + << " let gindex = i32(global_idx);\n" + << " let lindex = i32(local_idx);\n" + << " const wg = " << wg_ << ";\n" + << " let row = gindex / wg;\n" + << " let cols = uniforms.packedCols;\n" + << " let row_stride : i32 = uniforms.packedCols;\n" + + // Find the row's max value + << thread_max_decl + << " for (var col = lindex; col < cols; col += wg) {\n" + << " let value = getValue(row, col, row_stride);\n" + << " thread_max = max(thread_max, value);\n" + << " }\n" + << " if (lindex < cols) {\n" + << " thread_shared[lindex] = thread_max;\n" + << " }\n" + << " workgroupBarrier();\n" + + // Reduce to find the max value + << " var reduce_size = min(cols, wg);\n" + << " for (var curr_size = reduce_size >> 1; curr_size > 0; curr_size = reduce_size >> 1) {\n" + << " reduce_size = curr_size + (reduce_size & 1);\n" + << " if (lindex < curr_size) {\n" + << " thread_shared[lindex] = max(thread_shared[lindex], thread_shared[lindex + reduce_size]);\n" + << " }\n" + << " workgroupBarrier();\n" + << " }\n" + << " if (lindex == 0) {\n" + << " row_max_shared = x_value_t(" << MaxVector("thread_shared[0]", components) << ");\n" + << " }\n" + << " workgroupBarrier();\n" + + // Find the row's sum of exponentials + << " var thread_sum = x_value_t(0.0);\n" + << " for (var col = lindex; col < cols; col += wg) {\n" + << " let sub_exp = exp(getValue(row, col, row_stride) - row_max_shared);\n" + << " thread_sum += sub_exp;\n" + << " }\n" + << " thread_shared[lindex] = thread_sum;\n" + << " workgroupBarrier();\n" + + // Reduce to find the sum of exponentials + << " for (var curr_size = wg >> 1; curr_size > 0; curr_size = curr_size >> 1) {\n" + << " if (lindex < curr_size) {\n" + << " thread_shared[lindex] = thread_shared[lindex] + thread_shared[lindex + curr_size];\n" + << " }\n" + << " workgroupBarrier();\n" + << " }\n" + << " if (lindex == 0) {\n" + << " row_sum_shared = x_value_t(" << SumVector("thread_shared[0]", components) << ");\n" + << " }\n" + << " workgroupBarrier();\n" + + // Calculate the final value for each element in the row + << " for (var col = lindex; col < cols; col += wg) {\n" + << " let value = exp(getValue(row, col, row_stride) - row_max_shared) / row_sum_shared;\n" + << " setValue(row, col, row_stride, value);\n" + << " }\n"; + + return Status::OK(); +} + +Status Softmax::ComputeInternal(ComputeContext& context) const { + const auto* input_tensor = context.Input(0); + const TensorShape& input_shape = input_tensor->Shape(); + size_t input_rank = input_shape.NumDimensions(); + auto* output_tensor = context.Output(0, input_shape); + + // normalize axis + size_t axis = static_cast(HandleNegativeAxis(axis_, input_rank)); + bool is_transpose_required = axis < input_rank - 1; + + TensorShape transposed_input_shape; + Tensor 
transposed_input_tensor; + Tensor intermediate_output; + InlinedVector perm(input_rank); + + if (is_transpose_required) { + std::iota(std::begin(perm), std::end(perm), 0); + perm[axis] = input_rank - 1; + perm[input_rank - 1] = axis; + + TensorShapeVector transposed_input_dims; + for (auto e : perm) { + transposed_input_dims.push_back(input_shape[e]); + } + + transposed_input_shape = TensorShape(transposed_input_dims); + transposed_input_tensor = context.CreateGPUTensor(input_tensor->DataType(), transposed_input_shape); + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(context, perm, *input_tensor, transposed_input_tensor)); + intermediate_output = context.CreateGPUTensor(output_tensor->DataType(), transposed_input_shape); + } + + const int64_t cols = is_transpose_required ? transposed_input_shape[input_rank - 1] : input_shape[input_rank - 1]; + const int64_t rows = input_shape.Size() / cols; + const int64_t components = GetMaxComponents(cols); + const auto packed_cols = cols / components; + uint32_t workgroup_size = rows == 1 ? 256 : 64; + // check input tensor element type is float + const bool is_fp32 = input_tensor->GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + + SoftmaxProgram program{workgroup_size, is_fp32}; + if (is_transpose_required) { + program + .AddInputs({{&transposed_input_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}) + .AddOutputs({{&intermediate_output, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}); + } else { + program + .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}) + .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components)}}); + } + + program + .CacheHint(std::to_string(components), std::to_string(workgroup_size)) + .SetWorkgroupSize(workgroup_size) + .SetDispatchGroupSize(static_cast(rows)) + .AddUniformVariables({{static_cast(packed_cols)}}); + + ORT_RETURN_IF_ERROR(context.RunProgram(program)); + + // If transpose was required, transpose the result back + if (is_transpose_required) { + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(context, perm, intermediate_output, *output_tensor)); + } + + return Status::OK(); +} +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/math/softmax.h b/onnxruntime/core/providers/webgpu/math/softmax.h new file mode 100644 index 0000000000000..cc97611dcb4bc --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/softmax.h @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
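+
+// Note on defaults: the constructor below resolves the softmax axis per the
+// ONNX spec when no "axis" attribute is present -- opset < 13 defaults to
+// axis = 1, opset >= 13 defaults to axis = -1 (the last dimension).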
+
+#pragma once
+
+#include "core/providers/webgpu/webgpu_supported_types.h"
+#include "core/providers/webgpu/webgpu_kernel.h"
+#include "core/providers/webgpu/program.h"
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+class Softmax final : public WebGpuKernel {
+ public:
+  Softmax(const OpKernelInfo& info) : WebGpuKernel{info} {
+    int opset_ = info.node().SinceVersion();
+    int64_t axis;
+    Status status = info.GetAttr<int64_t>("axis", &axis);
+
+    if (status.IsOK()) {
+      axis_ = axis;
+    } else {
+      if (opset_ < 13) {
+        axis_ = 1;  // opset-12 and below, the default axis value is 1
+      } else {
+        axis_ = -1;  // opset-13, the default axis value is -1
+      }
+    }
+  }
+
+  Status ComputeInternal(ComputeContext& context) const override;
+
+ private:
+  int64_t axis_;
+};
+
+class SoftmaxProgram final : public Program<SoftmaxProgram> {
+ public:
+  SoftmaxProgram(uint32_t wg, bool is_fp32)
+      : Program{"Softmax"}, wg_{wg}, is_fp32_{is_fp32} {
+  }
+
+  Status GenerateShaderCode(ShaderHelper& sh) const override;
+
+  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"packedCols", ProgramUniformVariableDataType::Int32});
+
+ private:
+  uint32_t wg_;
+  bool is_fp32_;
+};
+
+}  // namespace webgpu
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
index c40ec43dd0009..24b98e9533d17 100644
--- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
@@ -47,7 +47,10 @@ ONNX_OPERATOR_KERNEL_EX(
         .TypeConstraint("T", WebGpuSupportedNumberTypes()),
     Transpose);

-auto SqueezeShape(const gsl::span<const int64_t>& shape, const gsl::span<const size_t>& adjusted_perm, InlinedVector<int64_t>& new_shape, InlinedVector<size_t>& new_perm) {
+auto SqueezeShape(const gsl::span<const int64_t>& shape,
+                  const gsl::span<const size_t>& adjusted_perm,
+                  TensorShapeVector& new_shape,
+                  TensorShapeVector& new_perm) {
   for (size_t i = 0; i < shape.size(); ++i) {
     if (shape[i] != 1) {
       new_shape.push_back(shape[i]);
@@ -97,26 +100,28 @@ Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const {
   return Status::OK();
 }

-Status Transpose::ComputeInternal(ComputeContext& context) const {
-  const auto* input_tensor = context.Input<Tensor>(0);
-  const TensorShape& input_shape = input_tensor->Shape();
+Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context,
+                              gsl::span<const size_t> permutations,
+                              const Tensor& input, Tensor& output) {
+  const auto& input_shape = input.Shape();
+  const auto& input_dims = input_shape.GetDims();
   int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());

   TensorShapeVector output_dims(rank);
-  InlinedVector<size_t> default_perm(rank);
-  const InlinedVector<size_t>* p_perm = nullptr;
-  ORT_RETURN_IF_ERROR(ComputeOutputShape(*input_tensor, output_dims, default_perm, p_perm));
-  TensorShape output_shape(output_dims);
-  auto* output_tensor = context.Output(0, output_shape);
-  InlinedVector<int64_t> new_shape{};
-  InlinedVector<size_t> new_perm{};
-  SqueezeShape(input_shape.GetDims(), *p_perm, new_shape, new_perm);
-  const bool channels_last = new_perm == InlinedVector<size_t>({2, 3, 1});
-  const bool channels_first = new_perm == InlinedVector<size_t>({3, 1, 2});
+  for (int32_t i = 0; i < rank; i++) {
+    output_dims[i] = input_dims[permutations[i]];
+  }
+
+  TensorShapeVector new_shape{};
+  TensorShapeVector new_perm{};
+  SqueezeShape(input_shape.GetDims(), permutations, new_shape, new_perm);
+  const bool channels_last = new_perm == TensorShapeVector({2, 3, 1});
+  const bool channels_first = new_perm == TensorShapeVector({3, 1, 2});
   const bool use_shared = (new_shape.size() == 2 && new_perm[0] > new_perm[1]) || channels_last || channels_first;

   auto new_input_shape = input_shape;
   TensorShape new_output_shape(output_dims);
+
   if (use_shared) {
     new_input_shape = channels_last
                           ? TensorShape({new_shape[0], new_shape[1] * new_shape[2]})
@@ -126,16 +131,16 @@ Status Transpose::ComputeInternal(ComputeContext& context) const {
     new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]});
   }

-  uint32_t output_size = gsl::narrow_cast<uint32_t>(input_tensor->Shape().Size());
-  TransposeProgram program{*p_perm, use_shared};
+  uint32_t output_size = gsl::narrow_cast<uint32_t>(input_shape.Size());
+  TransposeProgram program{permutations, use_shared};
+
   if (use_shared) {
     program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
   }
-
   program
-      .CacheHint(absl::StrJoin(*p_perm, "-"))
-      .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
-      .AddOutputs({{output_tensor, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
+      .CacheHint(absl::StrJoin(permutations, "-"))
+      .AddInputs({{&input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
+      .AddOutputs({{&output, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
       .SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
                             static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)))
       .AddUniformVariables({
@@ -148,5 +153,20 @@ Status Transpose::ComputeInternal(ComputeContext& context) const {
   return context.RunProgram(program);
 }

+Status Transpose::ComputeInternal(ComputeContext& context) const {
+  const auto* input_tensor = context.Input<Tensor>(0);
+  const TensorShape& input_shape = input_tensor->Shape();
+  int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());
+
+  TensorShapeVector output_dims(rank);
+  InlinedVector<size_t> default_perm(rank);
+  const InlinedVector<size_t>* p_perm = nullptr;
+  ORT_RETURN_IF_ERROR(ComputeOutputShape(*input_tensor, output_dims, default_perm, p_perm));
+  TensorShape output_shape(output_dims);
+  auto* output_tensor = context.Output(0, output_shape);
+
+  return DoTranspose(context, *p_perm, *input_tensor, *output_tensor);
+}
+
 }  // namespace webgpu
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h
index 7cf5c1fe0865d..b62a419fa12bc 100644
--- a/onnxruntime/core/providers/webgpu/tensor/transpose.h
+++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h
@@ -16,6 +16,8 @@ class Transpose final : public WebGpuKernel, public TransposeBase {
   Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} {
   }
   Status ComputeInternal(ComputeContext& context) const override;
+  static Status DoTranspose(onnxruntime::webgpu::ComputeContext& context, gsl::span<const size_t> permutations, const Tensor& input, Tensor& output);
+
   constexpr static uint32_t TILE_SIZE = 16;
 };

diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index d673f9d0717f0..f3bf2402252b7 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -628,9 +628,9 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
     // BuildKernelCreateInfo,
     // BuildKernelCreateInfo,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Softmax)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Softmax)>,
-    // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Softmax)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Softmax)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Softmax)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Softmax)>,
     BuildKernelCreateInfo,
     BuildKernelCreateInfo,

diff --git 
a/onnxruntime/test/providers/cpu/math/softmax_test.cc b/onnxruntime/test/providers/cpu/math/softmax_test.cc index 6f7930f722564..1c6375ebdb0b1 100644 --- a/onnxruntime/test/providers/cpu/math/softmax_test.cc +++ b/onnxruntime/test/providers/cpu/math/softmax_test.cc @@ -170,11 +170,11 @@ TEST(SoftmaxOperator, ThreeAndFourDimsAxis0) { RunTest(input_vals_60, expected_vals, three_dimensions, /*opset*/ 7, /*axis*/ 0, // axis=0 is not supported by TensorRT - {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider}); RunTest(input_vals_60, expected_vals, four_dimensions, /*opset*/ 7, /*axis*/ 0, // axis=0 is not supported by TensorRT - {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider}); } TEST(SoftmaxOperator, ThreeAndFourDimsSecondLastAxis) { @@ -201,10 +201,10 @@ TEST(SoftmaxOperator, ThreeAndFourDimsSecondLastAxis) { 0.040478885f, 0.033857856f, 0.080346674f, 0.06199841f, 0.040481992f}; RunTest(input_vals_60, expected_vals, three_dimensions, /*opset*/ 7, /*axis*/ 1, - {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider}); RunTest(input_vals_60, expected_vals, four_dimensions, /*opset*/ 7, /*axis*/ 2, - {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider}); + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kDnnlExecutionProvider, kWebGpuExecutionProvider}); } TEST(SoftmaxOperator, ThreeAndFourDimsSecondLastAxis_opset13) { @@ -376,8 +376,9 @@ TEST(SoftmaxOperator, DimWithZero) { RunTest(x_vals, expected_vals, dimensions, /*opset*/ -1, /*axis*/ 0, {kTensorrtExecutionProvider, - kNnapiExecutionProvider, // NNAPI softmax does not support empty input - kQnnExecutionProvider} // QNN doesn't support dim 0 + kNnapiExecutionProvider, // NNAPI softmax does not support empty input + kWebGpuExecutionProvider, // WebGPU does not support dim 0 + kQnnExecutionProvider} // QNN doesn't support dim 0 ); } From b2ab87e8b76edc67aeba5ba31093bae94efc64fb Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 7 Mar 2025 11:24:10 +1000 Subject: [PATCH 30/46] Exclude MAUI projects from GPU C# packaging builds (#23923) ### Description Use 'desktop only' solution in GPU C# packaging builds. We don't need to include any MAUI support for those builds. 
### Motivation and Context --- .../rocm-nuget-packaging-pipeline.yml | 6 +++--- .../stages/nuget-cuda-packaging-stage.yml | 15 +++------------ 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml index f7f5c7b1494e8..286f92b36f7e4 100644 --- a/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml @@ -224,7 +224,7 @@ stages: - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo msbuildArguments: '-t:restore -p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm"' @@ -233,7 +233,7 @@ stages: - task: MSBuild@1 displayName: 'Build C# bindings' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo msbuildArguments: > @@ -317,7 +317,7 @@ stages: - task: MSBuild@1 displayName: 'Clean C#' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo msbuildArguments: '-t:Clean -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm' diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml index 8fabb80a73869..5ae60aac8f9b4 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml @@ -96,18 +96,10 @@ stages: inputs: versionSpec: 6.10.x - - task: PowerShell@2 - displayName: Install MAUI workloads - inputs: - targetType: 'inline' - script: | - dotnet workload install android ios maccatalyst - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo msbuildArguments: '-t:restore -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"' @@ -116,7 +108,7 @@ stages: - task: MSBuild@1 displayName: 'Build C# bindings' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' configuration: RelWithDebInfo platform: 'Any CPU' msbuildArguments: > @@ -208,7 +200,7 @@ stages: - task: MSBuild@1 displayName: 'Clean C#' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo msbuildArguments: '-t:Clean -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu' @@ -223,4 +215,3 @@ stages: inputs: artifactName: 'drop-signed-nuget-GPU' targetPath: '$(Build.ArtifactStagingDirectory)' - From 
eeaf73b3f418f829acc17f3212056959e5703cd5 Mon Sep 17 00:00:00 2001 From: Sushanth Rajasankar <44513542+sushraja-msft@users.noreply.github.com> Date: Thu, 6 Mar 2025 17:44:37 -0800 Subject: [PATCH 31/46] Support all block sizes that are multiples of 32 for DP4A (#23907) ### Description Simple change 1. The DP4A shader actually supports all block sizes that are multiples of 32, relaxing the restriction and making a small tweak to support sizes other than 32. 2. Moved the shader to a separate file for maintainability. --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../webgpu/quantization/dp4a_matmul_nbits.cc | 326 ++++++++++++++++++ .../webgpu/quantization/dp4a_matmul_nbits.h | 56 +++ .../webgpu/quantization/matmul_nbits.cc | 297 +--------------- .../webgpu/quantization/matmul_nbits.h | 19 - 4 files changed, 387 insertions(+), 311 deletions(-) create mode 100644 onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc create mode 100644 onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc new file mode 100644 index 0000000000000..6720a6072f7bb --- /dev/null +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc @@ -0,0 +1,326 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h" +#include "core/providers/webgpu/shader_helper.h" + +namespace onnxruntime { +namespace contrib { +namespace webgpu { + +Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const { + shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + shader.AddOutput("output", ShaderUsage::UseUniform); + shader.AddOutput("scales", ShaderUsage::UseUniform); + shader.AdditionalImplementation() << R"ADDNL_FN( + fn readInput(offset: u32) -> input_a_value_t + { + if (offset > uniforms.input_size) { + return input_a_value_t(0); + } + return input_a[offset]; + } + )ADDNL_FN"; + shader.MainFunctionBody() << R"MAIN_FN( + var local_a : array, 32>; + var max_value:vec4 = vec4(0); + for (var idx:u32=0;idx<32;idx+=1) + { + local_a[idx] = readInput(workgroup_idx*32 + idx); + max_value = max(max_value, abs(local_a[idx])); + } + var scale = max(max_value.x, max_value.y); + scale = max(scale, max_value.z); + scale = max(scale, max_value.w); + for (var idx:u32=0;idx<32;idx+=1) + { + output[workgroup_idx*32+idx] = pack4x8snorm(vec4(local_a[idx]/scale)); + } + // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f. + scales[workgroup_idx] = scale/127; + )MAIN_FN"; + return Status::OK(); +} + +Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { + shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + shader.AddInput("scales_a", ShaderUsage::UseUniform); + shader.AddInput("input_b", ShaderUsage::UseUniform); + shader.AddInput("scales_b", ShaderUsage::UseUniform); + shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias); + + // This shader implements co-operative matrix multiply. 
The key idea here is to + // assume there is a primitive for medium size matrix multiply a subgroup can perform, + // using all its lanes and pooling all its registers to keep the values in registry. + // + // The entire workgroup which has N subgroups first loads a tile into shared memory, + // Then each subgroup loads a subtile from shared memory into registers and uses + // the medium size matrix multiply primitive to perform the math. + // The values for tile/subtile size are chosen to conform to the resource limits + // of an alderlake/tiger lake gpu. A tile is 64x64, workgroup is 256 threads - + // therefore there are 16 subgroups and 16 lanes in each subgroup. + // K the hidden dimension is paged in from RAM at k tile size which is 64. + // All this puts the shared memory requirement slightly above 16KB. + // WebGPU limit is 16KB, output is moved to registers instead of SHM to make + // everything fit in shared memory. + // + // Each subgroup performs a 16 x 64 x 16 multiply which is implemented with + // subgroup shuffle as a placeholder for the day the medium matrix mul primitive + // becomes available in WGSL. The registry requirements is ~2KB per subgroup, on + // Alderlake/Tigerlake subgroup has 8KB of registry space pooling the + // 512B of registry from each lane. + // + // The medium size matmul is implemented using dot4I8Packed, so the inputs for + // this shader require A to be int8 quantized with block size 64. B is regular + // matmulnbits input with block size 32. + + shader.AdditionalImplementation() << " const block_size = " << block_size_ << ";"; + + shader.AdditionalImplementation() << R"ADDNL_FN( + const tile_size = 64; + const subtile_size = 16; + const tile_size_k = 32; + const vec_factor = 4; + const u32_factor = 4; + const tile_size_k_vec = 2; + + // Shared memory + var tile_A : array, tile_size>, tile_size_k_vec>; // 64 x 32 + var scale_A : array; // 64 x 1 + var tile_B : array, tile_size>, tile_size_k_vec>; // 64 x 32 + var scale_B : array; // 64 x 1 + + fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) + { + let a_global = a_global_base + row; + if (a_global >= uniforms.M) + { + return; + } + tile_A[col][row] = input_a[a_global*uniforms.K16+kidx_v+col]; + if (col == 0) + { + // kidx_v - covers 16 values of k + scale_A[row] = scales_a[a_global*(uniforms.K/128) + kidx_v/8]; + } + } + + fn loadSHMB(b_global_base:u32, kidx_v:u32, row: u32, col: u32) + { + let b_global = b_global_base + row; + if (b_global >= uniforms.N) + { + return; + } + + let b_value = input_b[b_global*uniforms.K16+kidx_v+col]; + var b_value_lower = vec4(unpack4xU8(b_value[0] & 0x0F0F0F0Fu)) - vec4(8); + var b_value_upper = vec4(unpack4xU8((b_value[0] >> 4) & 0x0F0F0F0Fu)) - vec4(8); + tile_B[col][row][0] = pack4xI8(vec4(b_value_lower[0], b_value_upper[0], b_value_lower[1], b_value_upper[1])); + tile_B[col][row][1] = pack4xI8(vec4(b_value_lower[2], b_value_upper[2], b_value_lower[3], b_value_upper[3])); + b_value_lower = vec4(unpack4xU8(b_value[1] & 0x0F0F0F0Fu)) - vec4(8); + b_value_upper = vec4(unpack4xU8((b_value[1] >> 4) & 0x0F0F0F0Fu)) - vec4(8); + tile_B[col][row][2] = pack4xI8(vec4(b_value_lower[0], b_value_upper[0], b_value_lower[1], b_value_upper[1])); + tile_B[col][row][3] = pack4xI8(vec4(b_value_lower[2], b_value_upper[2], b_value_lower[3], b_value_upper[3])); + if (col == 0) + { + // kidx_v - each kidx_v covers 16 values of k + scale_B[row] = scales_b[b_global*(uniforms.K/block_size) + kidx_v/(block_size/16)]; + } + } + + // Scaled dot product of 8 packed unsigned 
integers. + fn SDP8AI(a1:vec4, b1:vec4, a2:vec4, b2:vec4, scale:output_element_t) -> output_element_t + { + var local_sum = dot4I8Packed(a1[0], b1[0]); + local_sum += dot4I8Packed(a1[1], b1[1]); + local_sum += dot4I8Packed(a1[2], b1[2]); + local_sum += dot4I8Packed(a1[3], b1[3]); + local_sum += dot4I8Packed(a2[0], b2[0]); + local_sum += dot4I8Packed(a2[1], b2[1]); + local_sum += dot4I8Packed(a2[2], b2[2]); + local_sum += dot4I8Packed(a2[3], b2[3]); + return output_element_t(local_sum) * scale; + } + )ADDNL_FN"; + + shader.MainFunctionBody() << R"MAIN_FN( + // During the load phase we use all 256 threads to load 64 rows of A/B. + // For each row we load tile_size_k_vec (2) vectorized elements, which are 32 elements of K. + let a_global_base = workgroup_id.x * tile_size; + let b_global_base = workgroup_id.y * tile_size; + let load_AorB = u32(local_idx/128); + let load_row = u32((local_idx%128)/2); + let load_col = u32(local_idx%2); + + // During the compute phase, we have the 64x64 tile split into + // subtiles of 16x16. We have a grid of 4x4 subtiles. + let subtile_id = u32(local_idx / subtile_size); + let subtile_idx = u32(subtile_id / 4); + let subtile_idy = u32(subtile_id % 4); + let base_A = subtile_idx * 16; + let base_B = subtile_idy * 16; + // For each subtile we have 16 threads assigned. + let a_idx = u32(local_idx % subtile_size); + + var lane_output1: vec4; + var lane_output2: vec4; + var lane_output3: vec4; + var lane_output4: vec4; + // K's vectrorization is 16 items per index. See input_a/input_b. + // tile_size_k_vec - is the k tile size in vectorized space (1/16). That is + // k tile size is 32. In vectorized space that is 32/16 = 2. + for (var kidx_v:u32 = 0; kidx_v < uniforms.K16; kidx_v+=tile_size_k_vec) + { + // Load Phase: Populate shared memory for the workgroup. + if (load_AorB == 0) + { + loadSHMA(a_global_base, kidx_v, load_row, load_col); + } + else + { + loadSHMB(b_global_base, kidx_v, load_row, load_col); + } + workgroupBarrier(); + + // Compute phase: Perform matmul for this subtile 16 x 32 x 16. + // Step 1: Load from shared memory into registers across entire subgroup. + var own_a0: vec4 = tile_A[0][base_A + a_idx]; + var own_a1: vec4 = tile_A[1][base_A + a_idx]; + var own_scale_a: output_element_t = scale_A[base_A + a_idx]; + if (sg_size == 16) + { + var own_b0: vec4 = tile_B[0][base_B + sg_id]; + var own_b1: vec4 = tile_B[1][base_B + sg_id]; + var own_scale_b: output_element_t = scale_B[base_B + sg_id]; + // Step 2: Access registers across the subgroup using subgroupShuffle and perform the matmul. 
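+      // Each of the 16 lanes owns one row of A (own_a0/own_a1) and one row of
+      // B (own_b0/own_b1). subgroupShuffle(own_b0, j) broadcasts lane j's row
+      // of B to every lane, so after the 16 shuffles below each lane has
+      // accumulated a full 16-wide output row into lane_output1..lane_output4.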
+ lane_output1[0] += SDP8AI(own_a0, subgroupShuffle(own_b0, 0), own_a1, subgroupShuffle(own_b1, 0), subgroupShuffle(own_scale_b, 0) * own_scale_a); + lane_output1[1] += SDP8AI(own_a0, subgroupShuffle(own_b0, 1), own_a1, subgroupShuffle(own_b1, 1), subgroupShuffle(own_scale_b, 1) * own_scale_a); + lane_output1[2] += SDP8AI(own_a0, subgroupShuffle(own_b0, 2), own_a1, subgroupShuffle(own_b1, 2), subgroupShuffle(own_scale_b, 2) * own_scale_a); + lane_output1[3] += SDP8AI(own_a0, subgroupShuffle(own_b0, 3), own_a1, subgroupShuffle(own_b1, 3), subgroupShuffle(own_scale_b, 3) * own_scale_a); + + lane_output2[0] += SDP8AI(own_a0, subgroupShuffle(own_b0, 4), own_a1, subgroupShuffle(own_b1, 4), subgroupShuffle(own_scale_b, 4) * own_scale_a); + lane_output2[1] += SDP8AI(own_a0, subgroupShuffle(own_b0, 5), own_a1, subgroupShuffle(own_b1, 5), subgroupShuffle(own_scale_b, 5) * own_scale_a); + lane_output2[2] += SDP8AI(own_a0, subgroupShuffle(own_b0, 6), own_a1, subgroupShuffle(own_b1, 6), subgroupShuffle(own_scale_b, 6) * own_scale_a); + lane_output2[3] += SDP8AI(own_a0, subgroupShuffle(own_b0, 7), own_a1, subgroupShuffle(own_b1, 7), subgroupShuffle(own_scale_b, 7) * own_scale_a); + + lane_output3[0] += SDP8AI(own_a0, subgroupShuffle(own_b0, 8), own_a1, subgroupShuffle(own_b1, 8), subgroupShuffle(own_scale_b, 8) * own_scale_a); + lane_output3[1] += SDP8AI(own_a0, subgroupShuffle(own_b0, 9), own_a1, subgroupShuffle(own_b1, 9), subgroupShuffle(own_scale_b, 9) * own_scale_a); + lane_output3[2] += SDP8AI(own_a0, subgroupShuffle(own_b0, 10), own_a1, subgroupShuffle(own_b1, 10), subgroupShuffle(own_scale_b, 10) * own_scale_a); + lane_output3[3] += SDP8AI(own_a0, subgroupShuffle(own_b0, 11), own_a1, subgroupShuffle(own_b1, 11), subgroupShuffle(own_scale_b, 11) * own_scale_a); + + lane_output4[0] += SDP8AI(own_a0, subgroupShuffle(own_b0, 12), own_a1, subgroupShuffle(own_b1, 12), subgroupShuffle(own_scale_b, 12) * own_scale_a); + lane_output4[1] += SDP8AI(own_a0, subgroupShuffle(own_b0, 13), own_a1, subgroupShuffle(own_b1, 13), subgroupShuffle(own_scale_b, 13) * own_scale_a); + lane_output4[2] += SDP8AI(own_a0, subgroupShuffle(own_b0, 14), own_a1, subgroupShuffle(own_b1, 14), subgroupShuffle(own_scale_b, 14) * own_scale_a); + lane_output4[3] += SDP8AI(own_a0, subgroupShuffle(own_b0, 15), own_a1, subgroupShuffle(own_b1, 15), subgroupShuffle(own_scale_b, 15) * own_scale_a); + } + else + { + // Code for other subgroup sizes, simply doesnt use subgroups at all. + // Relies on reads from single location tile_B[][base_B + col] by all + // being optimized by the hardware. 
+ lane_output1[0] += SDP8AI(own_a0, tile_B[0][base_B + 0], own_a1, tile_B[1][base_B + 0], own_scale_a * scale_B[base_B + 0]); + lane_output1[1] += SDP8AI(own_a0, tile_B[0][base_B + 1], own_a1, tile_B[1][base_B + 1], own_scale_a * scale_B[base_B + 1]); + lane_output1[2] += SDP8AI(own_a0, tile_B[0][base_B + 2], own_a1, tile_B[1][base_B + 2], own_scale_a * scale_B[base_B + 2]); + lane_output1[3] += SDP8AI(own_a0, tile_B[0][base_B + 3], own_a1, tile_B[1][base_B + 3], own_scale_a * scale_B[base_B + 3]); + + lane_output2[0] += SDP8AI(own_a0, tile_B[0][base_B + 4], own_a1, tile_B[1][base_B + 4], own_scale_a * scale_B[base_B + 4]); + lane_output2[1] += SDP8AI(own_a0, tile_B[0][base_B + 5], own_a1, tile_B[1][base_B + 5], own_scale_a * scale_B[base_B + 5]); + lane_output2[2] += SDP8AI(own_a0, tile_B[0][base_B + 6], own_a1, tile_B[1][base_B + 6], own_scale_a * scale_B[base_B + 6]); + lane_output2[3] += SDP8AI(own_a0, tile_B[0][base_B + 7], own_a1, tile_B[1][base_B + 7], own_scale_a * scale_B[base_B + 7]); + + lane_output3[0] += SDP8AI(own_a0, tile_B[0][base_B + 8], own_a1, tile_B[1][base_B + 8], own_scale_a * scale_B[base_B + 8]); + lane_output3[1] += SDP8AI(own_a0, tile_B[0][base_B + 9], own_a1, tile_B[1][base_B + 9], own_scale_a * scale_B[base_B + 9]); + lane_output3[2] += SDP8AI(own_a0, tile_B[0][base_B + 10], own_a1, tile_B[1][base_B + 10], own_scale_a * scale_B[base_B + 10]); + lane_output3[3] += SDP8AI(own_a0, tile_B[0][base_B + 11], own_a1, tile_B[1][base_B + 11], own_scale_a * scale_B[base_B + 11]); + + lane_output4[0] += SDP8AI(own_a0, tile_B[0][base_B + 12], own_a1, tile_B[1][base_B + 12], own_scale_a * scale_B[base_B + 12]); + lane_output4[1] += SDP8AI(own_a0, tile_B[0][base_B + 13], own_a1, tile_B[1][base_B + 13], own_scale_a * scale_B[base_B + 13]); + lane_output4[2] += SDP8AI(own_a0, tile_B[0][base_B + 14], own_a1, tile_B[1][base_B + 14], own_scale_a * scale_B[base_B + 14]); + lane_output4[3] += SDP8AI(own_a0, tile_B[0][base_B + 15], own_a1, tile_B[1][base_B + 15], own_scale_a * scale_B[base_B + 15]); + } + workgroupBarrier(); + } + + let a_global = a_global_base + base_A + a_idx; + let b_global = b_global_base + base_B; + let output_idx = ((a_global) * uniforms.N + b_global)/4; + // This creates a shader requirement that uniforms.N % 16 == 0 + if (a_global < uniforms.M && b_global < uniforms.N) + { + output[output_idx] = lane_output1; + output[output_idx+1] = lane_output2; + output[output_idx+2] = lane_output3; + output[output_idx+3] = lane_output4; + } + )MAIN_FN"; + + return Status::OK(); +} + +Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales, + uint32_t M, + uint32_t N, + uint32_t K, + uint32_t block_size, + onnxruntime::webgpu::ComputeContext& context, + Tensor* y) { + constexpr uint32_t kVec4Components = 4; + constexpr uint32_t kVec2Components = 2; + constexpr uint32_t kU32Components = 4; + + constexpr uint32_t kBlockSizeA = 128; + DP4AMatMulQuantizeProgram quantize_program; + quantize_program.SetWorkgroupSize(1); + quantize_program.SetDispatchGroupSize(M * K / kBlockSizeA, 1, 1); + TensorShape a_quant_shape{1, M, K / kU32Components}; + Tensor a_quant = context.CreateGPUTensor(DataTypeImpl::GetType(), a_quant_shape); + TensorShapeVector a_scales_dims({1, 1, M, K / kBlockSizeA}); + Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims); + quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec4Components)}}) + .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 
gsl::narrow(1)}, + {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), gsl::narrow(1)}}) + .AddUniformVariable({static_cast(M * K / kVec4Components)}); + ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program)); + + constexpr uint32_t kTileSize = 64; + TensorShape reshaped_y_shape{1, M, N / kVec4Components}; + DP4AMatMulNBitsProgram mul_program{block_size}; + mul_program.SetWorkgroupSize(256); + mul_program.SetDispatchGroupSize( + (M + kTileSize - 1) / kTileSize, + (N + kTileSize - 1) / kTileSize, 1); + mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec4Components)}, + {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}, + {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec2Components * kU32Components)}, + {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}}) + .AddUniformVariables({{static_cast(M)}, + {static_cast(N)}, + {static_cast(K)}, + {static_cast(K / 8)}, + {static_cast(K / 16)}}) + .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow(kVec4Components)}) + .CacheHint("Block" + std::to_string(block_size)); + return context.RunProgram(mul_program); +} + +bool CanApplyDP4AMatrixMatMulNBits(onnxruntime::webgpu::ComputeContext& context, + uint64_t accuracy_level, + uint32_t block_size, + uint32_t batch_count, + uint32_t N, + uint32_t K, + uint32_t components_k, + bool has_zero_points) { + // macOS - Avoid using dp4a on Metal, as it does not appear to have native dp4a support. + // https://github.com/gpuweb/gpuweb/issues/2677#issuecomment-1713292226 + bool use_dp4a = context.Device().HasFeature(wgpu::FeatureName::Subgroups) && + context.AdapterInfo().backendType != wgpu::BackendType::Metal; + return (accuracy_level == 4 && block_size % 32 == 0 && + batch_count == 1 && components_k == 4 && K % 64 == 0 && N % 16 == 0 && + !has_zero_points && use_dp4a); +} + +} // namespace webgpu +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h new file mode 100644 index 0000000000000..15b86d78301ad --- /dev/null +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h @@ -0,0 +1,56 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
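+
+// This header declares the DP4A-based MatMulNBits path: an int8 quantization
+// program for A (one scale per 128-element block), the co-operative matmul
+// program itself, and CanApplyDP4AMatrixMatMulNBits, which gates the path on
+// accuracy level, block size, shapes, and device support (see the .cc file).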
+ +#pragma once + +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/webgpu_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace webgpu { + +using namespace onnxruntime::webgpu; + +class DP4AMatMulQuantizeProgram final : public Program { + public: + DP4AMatMulQuantizeProgram() : Program{"DP4AMatMulQuantize"} {} + Status GenerateShaderCode(ShaderHelper& sh) const override; + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"input_size", ProgramUniformVariableDataType::Uint32}); +}; + +class DP4AMatMulNBitsProgram final : public Program { + public: + DP4AMatMulNBitsProgram(uint32_t block_size) : Program{"DP4AMatMulNBits"}, block_size_(block_size) {} + Status GenerateShaderCode(ShaderHelper& sh) const override; + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( + {"M", ProgramUniformVariableDataType::Uint32}, + {"N", ProgramUniformVariableDataType::Uint32}, + {"K", ProgramUniformVariableDataType::Uint32}, + {"K8", ProgramUniformVariableDataType::Uint32}, + {"K16", ProgramUniformVariableDataType::Uint32}); + + private: + uint32_t block_size_; +}; + +Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales, + uint32_t M, + uint32_t N, + uint32_t K, + uint32_t block_size, + onnxruntime::webgpu::ComputeContext& context, + Tensor* y); + +bool CanApplyDP4AMatrixMatMulNBits(onnxruntime::webgpu::ComputeContext& context, + uint64_t accuracy_level, + uint32_t block_size, + uint32_t batch_count, + uint32_t N, + uint32_t K, + uint32_t components_k, + bool has_zero_points); + +} // namespace webgpu +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index 1534fd26d3ad9..e10a7f551eec9 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -5,6 +5,7 @@ #include "contrib_ops/webgpu/quantization/matmul_nbits.h" #include "contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.h" +#include "contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h" #include "contrib_ops/webgpu/webgpu_contrib_kernels.h" #include "core/providers/cpu/math/matmul_helper.h" #include "core/providers/webgpu/shader_helper.h" @@ -532,255 +533,6 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { return Status::OK(); } -Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const { - shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); - shader.AddOutput("output", ShaderUsage::UseUniform); - shader.AddOutput("scales", ShaderUsage::UseUniform); - shader.AdditionalImplementation() << R"ADDNL_FN( - fn readInput(offset: u32) -> input_a_value_t - { - if (offset > uniforms.input_size) { - return input_a_value_t(0); - } - return input_a[offset]; - } -)ADDNL_FN"; - shader.MainFunctionBody() << R"MAIN_FN( - var local_a : array, 32>; - var max_value:vec4 = vec4(0); - for (var idx:u32=0;idx<32;idx+=1) - { - local_a[idx] = readInput(workgroup_idx*32 + idx); - max_value = max(max_value, abs(local_a[idx])); - } - var scale = max(max_value.x, max_value.y); - scale = max(scale, max_value.z); - scale = max(scale, max_value.w); - for (var idx:u32=0;idx<32;idx+=1) - { - output[workgroup_idx*32+idx] = pack4x8snorm(vec4(local_a[idx]/scale)); - } - // 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f. 
- scales[workgroup_idx] = scale/127;
-)MAIN_FN";
- return Status::OK();
-}
-
-Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
- shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
- shader.AddInput("scales_a", ShaderUsage::UseUniform);
- shader.AddInput("input_b", ShaderUsage::UseUniform);
- shader.AddInput("scales_b", ShaderUsage::UseUniform);
- shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseElementTypeAlias);
-
- // This shader implements co-operative matrix multiply. The key idea here is to
- // assume there is a primitive for a medium-size matrix multiply that a subgroup can
- // perform, using all its lanes and pooling all its registers to keep the values in
- // registers.
- //
- // The entire workgroup, which has N subgroups, first loads a tile into shared memory.
- // Then each subgroup loads a subtile from shared memory into registers and uses
- // the medium-size matrix multiply primitive to perform the math.
- // The values for tile/subtile size are chosen to conform to the resource limits
- // of an Alder Lake/Tiger Lake GPU. A tile is 64x64, the workgroup is 256 threads -
- // therefore there are 16 subgroups and 16 lanes in each subgroup.
- // K, the hidden dimension, is paged in from RAM at a K tile size of 64.
- // All this puts the shared memory requirement slightly above 16KB.
- // The WebGPU limit is 16KB, so the output is moved to registers instead of SHM to
- // make everything fit in shared memory.
- //
- // Each subgroup performs a 16 x 64 x 16 multiply, which is implemented with
- // subgroup shuffle as a placeholder for the day the medium matrix multiply primitive
- // becomes available in WGSL. The register requirement is ~2KB per subgroup; on
- // Alder Lake/Tiger Lake a subgroup has 8KB of register space, pooling the
- // 512B of registers from each lane.
- //
- // The medium-size matmul is implemented using dot4I8Packed, so the inputs for
- // this shader require A to be int8 quantized with block size 64. B is regular
- // MatMulNBits input with block size 32.
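For reference, the scaled integer dot product at the heart of this shader can be written out on the host as plain C++. The sketch below mirrors the semantics of WGSL's dot4I8Packed and of the SDP8AI helper defined further down (illustrative only, not part of the patch; std::array stands in for vec4<u32>):

````
#include <array>
#include <cstdint>

// dot4I8Packed: treat each u32 as four signed int8 lanes and accumulate the
// four lane-wise products.
int32_t Dot4I8Packed(uint32_t a, uint32_t b) {
  int32_t sum = 0;
  for (int lane = 0; lane < 4; ++lane) {
    const auto a8 = static_cast<int8_t>((a >> (8 * lane)) & 0xFFu);
    const auto b8 = static_cast<int8_t>((b >> (8 * lane)) & 0xFFu);
    sum += static_cast<int32_t>(a8) * static_cast<int32_t>(b8);
  }
  return sum;
}

// SDP8AI: each side supplies two vec4<u32> (i.e. 32 int8 values); the integer
// dot product is rescaled by the product of the A and B dequantization scales.
float SDP8AI(const std::array<uint32_t, 4>& a1, const std::array<uint32_t, 4>& b1,
             const std::array<uint32_t, 4>& a2, const std::array<uint32_t, 4>& b2,
             float scale) {
  int32_t local_sum = 0;
  for (int i = 0; i < 4; ++i) local_sum += Dot4I8Packed(a1[i], b1[i]);
  for (int i = 0; i < 4; ++i) local_sum += Dot4I8Packed(a2[i], b2[i]);
  return static_cast<float>(local_sum) * scale;
}
````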
- - shader.AdditionalImplementation() << R"ADDNL_FN( - const tile_size = 64; - const subtile_size = 16; - const tile_size_k = 32; - const vec_factor = 4; - const u32_factor = 4; - const tile_size_k_vec = 2; - const block_size = 32; - - // Shared memory - var tile_A : array, tile_size>, tile_size_k_vec>; // 64 x 32 - var scale_A : array; // 64 x 1 - var tile_B : array, tile_size>, tile_size_k_vec>; // 64 x 32 - var scale_B : array; // 64 x 1 - - fn loadSHMA(a_global_base:u32, kidx_v:u32, row: u32, col: u32) - { - let a_global = a_global_base + row; - if (a_global >= uniforms.M) - { - return; - } - tile_A[col][row] = input_a[a_global*uniforms.K16+kidx_v+col]; - if (col == 0) - { - // kidx_v - covers 16 values of k - scale_A[row] = scales_a[a_global*(uniforms.K/128) + kidx_v/8]; - } - } - - fn loadSHMB(b_global_base:u32, kidx_v:u32, row: u32, col: u32) - { - let b_global = b_global_base + row; - if (b_global >= uniforms.N) - { - return; - } - - let b_value = input_b[b_global*uniforms.K16+kidx_v+col]; - var b_value_lower = vec4(unpack4xU8(b_value[0] & 0x0F0F0F0Fu)) - vec4(8); - var b_value_upper = vec4(unpack4xU8((b_value[0] >> 4) & 0x0F0F0F0Fu)) - vec4(8); - tile_B[col][row][0] = pack4xI8(vec4(b_value_lower[0], b_value_upper[0], b_value_lower[1], b_value_upper[1])); - tile_B[col][row][1] = pack4xI8(vec4(b_value_lower[2], b_value_upper[2], b_value_lower[3], b_value_upper[3])); - b_value_lower = vec4(unpack4xU8(b_value[1] & 0x0F0F0F0Fu)) - vec4(8); - b_value_upper = vec4(unpack4xU8((b_value[1] >> 4) & 0x0F0F0F0Fu)) - vec4(8); - tile_B[col][row][2] = pack4xI8(vec4(b_value_lower[0], b_value_upper[0], b_value_lower[1], b_value_upper[1])); - tile_B[col][row][3] = pack4xI8(vec4(b_value_lower[2], b_value_upper[2], b_value_lower[3], b_value_upper[3])); - if (col == 0) - { - // kidx_v - each kidx_v covers 16 values of k - scale_B[row] = scales_b[b_global*(uniforms.K/32) + kidx_v/2]; - } - } - - // Scaled dot product of 8 packed unsigned integers. - fn SDP8AI(a1:vec4, b1:vec4, a2:vec4, b2:vec4, scale:output_element_t) -> output_element_t - { - var local_sum = dot4I8Packed(a1[0], b1[0]); - local_sum += dot4I8Packed(a1[1], b1[1]); - local_sum += dot4I8Packed(a1[2], b1[2]); - local_sum += dot4I8Packed(a1[3], b1[3]); - local_sum += dot4I8Packed(a2[0], b2[0]); - local_sum += dot4I8Packed(a2[1], b2[1]); - local_sum += dot4I8Packed(a2[2], b2[2]); - local_sum += dot4I8Packed(a2[3], b2[3]); - return output_element_t(local_sum) * scale; - } -)ADDNL_FN"; - - shader.MainFunctionBody() << R"MAIN_FN( - // During the load phase we use all 256 threads to load 64 rows of A/B. - // For each row we load tile_size_k_vec (2) vectorized elements, which are 32 elements of K. - let a_global_base = workgroup_id.x * tile_size; - let b_global_base = workgroup_id.y * tile_size; - let load_AorB = u32(local_idx/128); - let load_row = u32((local_idx%128)/2); - let load_col = u32(local_idx%2); - - // During the compute phase, we have the 64x64 tile split into - // subtiles of 16x16. We have a grid of 4x4 subtiles. - let subtile_id = u32(local_idx / subtile_size); - let subtile_idx = u32(subtile_id / 4); - let subtile_idy = u32(subtile_id % 4); - let base_A = subtile_idx * 16; - let base_B = subtile_idy * 16; - // For each subtile we have 16 threads assigned. - let a_idx = u32(local_idx % subtile_size); - - var lane_output1: vec4; - var lane_output2: vec4; - var lane_output3: vec4; - var lane_output4: vec4; - // K's vectrorization is 16 items per index. See input_a/input_b. 
- // tile_size_k_vec - is the k tile size in vectorized space (1/16). That is - // k tile size is 32. In vectorized space that is 32/16 = 2. - for (var kidx_v:u32 = 0; kidx_v < uniforms.K16; kidx_v+=tile_size_k_vec) - { - // Load Phase: Populate shared memory for the workgroup. - if (load_AorB == 0) - { - loadSHMA(a_global_base, kidx_v, load_row, load_col); - } - else - { - loadSHMB(b_global_base, kidx_v, load_row, load_col); - } - workgroupBarrier(); - - // Compute phase: Perform matmul for this subtile 16 x 32 x 16. - // Step 1: Load from shared memory into registers across entire subgroup. - var own_a0: vec4 = tile_A[0][base_A + a_idx]; - var own_a1: vec4 = tile_A[1][base_A + a_idx]; - var own_scale_a: output_element_t = scale_A[base_A + a_idx]; - if (sg_size == 16) - { - var own_b0: vec4 = tile_B[0][base_B + sg_id]; - var own_b1: vec4 = tile_B[1][base_B + sg_id]; - var own_scale_b: output_element_t = scale_B[base_B + sg_id]; - // Step 2: Access registers across the subgroup using subgroupShuffle and perform the matmul. - lane_output1[0] += SDP8AI(own_a0, subgroupShuffle(own_b0, 0), own_a1, subgroupShuffle(own_b1, 0), subgroupShuffle(own_scale_b, 0) * own_scale_a); - lane_output1[1] += SDP8AI(own_a0, subgroupShuffle(own_b0, 1), own_a1, subgroupShuffle(own_b1, 1), subgroupShuffle(own_scale_b, 1) * own_scale_a); - lane_output1[2] += SDP8AI(own_a0, subgroupShuffle(own_b0, 2), own_a1, subgroupShuffle(own_b1, 2), subgroupShuffle(own_scale_b, 2) * own_scale_a); - lane_output1[3] += SDP8AI(own_a0, subgroupShuffle(own_b0, 3), own_a1, subgroupShuffle(own_b1, 3), subgroupShuffle(own_scale_b, 3) * own_scale_a); - - lane_output2[0] += SDP8AI(own_a0, subgroupShuffle(own_b0, 4), own_a1, subgroupShuffle(own_b1, 4), subgroupShuffle(own_scale_b, 4) * own_scale_a); - lane_output2[1] += SDP8AI(own_a0, subgroupShuffle(own_b0, 5), own_a1, subgroupShuffle(own_b1, 5), subgroupShuffle(own_scale_b, 5) * own_scale_a); - lane_output2[2] += SDP8AI(own_a0, subgroupShuffle(own_b0, 6), own_a1, subgroupShuffle(own_b1, 6), subgroupShuffle(own_scale_b, 6) * own_scale_a); - lane_output2[3] += SDP8AI(own_a0, subgroupShuffle(own_b0, 7), own_a1, subgroupShuffle(own_b1, 7), subgroupShuffle(own_scale_b, 7) * own_scale_a); - - lane_output3[0] += SDP8AI(own_a0, subgroupShuffle(own_b0, 8), own_a1, subgroupShuffle(own_b1, 8), subgroupShuffle(own_scale_b, 8) * own_scale_a); - lane_output3[1] += SDP8AI(own_a0, subgroupShuffle(own_b0, 9), own_a1, subgroupShuffle(own_b1, 9), subgroupShuffle(own_scale_b, 9) * own_scale_a); - lane_output3[2] += SDP8AI(own_a0, subgroupShuffle(own_b0, 10), own_a1, subgroupShuffle(own_b1, 10), subgroupShuffle(own_scale_b, 10) * own_scale_a); - lane_output3[3] += SDP8AI(own_a0, subgroupShuffle(own_b0, 11), own_a1, subgroupShuffle(own_b1, 11), subgroupShuffle(own_scale_b, 11) * own_scale_a); - - lane_output4[0] += SDP8AI(own_a0, subgroupShuffle(own_b0, 12), own_a1, subgroupShuffle(own_b1, 12), subgroupShuffle(own_scale_b, 12) * own_scale_a); - lane_output4[1] += SDP8AI(own_a0, subgroupShuffle(own_b0, 13), own_a1, subgroupShuffle(own_b1, 13), subgroupShuffle(own_scale_b, 13) * own_scale_a); - lane_output4[2] += SDP8AI(own_a0, subgroupShuffle(own_b0, 14), own_a1, subgroupShuffle(own_b1, 14), subgroupShuffle(own_scale_b, 14) * own_scale_a); - lane_output4[3] += SDP8AI(own_a0, subgroupShuffle(own_b0, 15), own_a1, subgroupShuffle(own_b1, 15), subgroupShuffle(own_scale_b, 15) * own_scale_a); - } - else - { - // Code for other subgroup sizes, simply doesnt use subgroups at all. 
- // Relies on reads from single location tile_B[][base_B + col] by all - // being optimized by the hardware. - lane_output1[0] += SDP8AI(own_a0, tile_B[0][base_B + 0], own_a1, tile_B[1][base_B + 0], own_scale_a * scale_B[base_B + 0]); - lane_output1[1] += SDP8AI(own_a0, tile_B[0][base_B + 1], own_a1, tile_B[1][base_B + 1], own_scale_a * scale_B[base_B + 1]); - lane_output1[2] += SDP8AI(own_a0, tile_B[0][base_B + 2], own_a1, tile_B[1][base_B + 2], own_scale_a * scale_B[base_B + 2]); - lane_output1[3] += SDP8AI(own_a0, tile_B[0][base_B + 3], own_a1, tile_B[1][base_B + 3], own_scale_a * scale_B[base_B + 3]); - - lane_output2[0] += SDP8AI(own_a0, tile_B[0][base_B + 4], own_a1, tile_B[1][base_B + 4], own_scale_a * scale_B[base_B + 4]); - lane_output2[1] += SDP8AI(own_a0, tile_B[0][base_B + 5], own_a1, tile_B[1][base_B + 5], own_scale_a * scale_B[base_B + 5]); - lane_output2[2] += SDP8AI(own_a0, tile_B[0][base_B + 6], own_a1, tile_B[1][base_B + 6], own_scale_a * scale_B[base_B + 6]); - lane_output2[3] += SDP8AI(own_a0, tile_B[0][base_B + 7], own_a1, tile_B[1][base_B + 7], own_scale_a * scale_B[base_B + 7]); - - lane_output3[0] += SDP8AI(own_a0, tile_B[0][base_B + 8], own_a1, tile_B[1][base_B + 8], own_scale_a * scale_B[base_B + 8]); - lane_output3[1] += SDP8AI(own_a0, tile_B[0][base_B + 9], own_a1, tile_B[1][base_B + 9], own_scale_a * scale_B[base_B + 9]); - lane_output3[2] += SDP8AI(own_a0, tile_B[0][base_B + 10], own_a1, tile_B[1][base_B + 10], own_scale_a * scale_B[base_B + 10]); - lane_output3[3] += SDP8AI(own_a0, tile_B[0][base_B + 11], own_a1, tile_B[1][base_B + 11], own_scale_a * scale_B[base_B + 11]); - - lane_output4[0] += SDP8AI(own_a0, tile_B[0][base_B + 12], own_a1, tile_B[1][base_B + 12], own_scale_a * scale_B[base_B + 12]); - lane_output4[1] += SDP8AI(own_a0, tile_B[0][base_B + 13], own_a1, tile_B[1][base_B + 13], own_scale_a * scale_B[base_B + 13]); - lane_output4[2] += SDP8AI(own_a0, tile_B[0][base_B + 14], own_a1, tile_B[1][base_B + 14], own_scale_a * scale_B[base_B + 14]); - lane_output4[3] += SDP8AI(own_a0, tile_B[0][base_B + 15], own_a1, tile_B[1][base_B + 15], own_scale_a * scale_B[base_B + 15]); - } - workgroupBarrier(); - } - - let a_global = a_global_base + base_A + a_idx; - let b_global = b_global_base + base_B; - let output_idx = ((a_global) * uniforms.N + b_global)/4; - // This creates a shader requirement that uniforms.N % 16 == 0 - if (a_global < uniforms.M && b_global < uniforms.N) - { - output[output_idx] = lane_output1; - output[output_idx+1] = lane_output2; - output[output_idx+2] = lane_output3; - output[output_idx+3] = lane_output4; - } -)MAIN_FN"; - - return Status::OK(); -} - Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { const Tensor* a = context.Input(0); const Tensor* b = context.Input(1); @@ -822,54 +574,15 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context return ApplySubgroupMatrixMatMulNBits(a, b, scales, M, N, K, context, y); } - const bool has_subgroup = context.Device().HasFeature(wgpu::FeatureName::Subgroups); - // macOS - Avoid using dp4a on Metal, as it does not appear to have native dp4a support. 
- // https://github.com/gpuweb/gpuweb/issues/2677#issuecomment-1713292226 - const bool use_dp4a = has_subgroup && context.AdapterInfo().backendType != wgpu::BackendType::Metal; - if (accuracy_level_ == 4 && block_size == 32 && - batch_count == 1 && components_a == 4 && K % 64 == 0 && N % 16 == 0 && - !has_zero_points && use_dp4a && M >= kMinMForTileOptimization) { - constexpr uint32_t kVec4Components = 4; - constexpr uint32_t kVec2Components = 2; - constexpr uint32_t kU32Components = 4; - - constexpr uint32_t kBlockSizeA = 128; - DP4AMatMulQuantizeProgram quantize_program; - quantize_program.SetWorkgroupSize(1); - quantize_program.SetDispatchGroupSize(M * K / kBlockSizeA, 1, 1); - TensorShape a_quant_shape{1, M, K / kU32Components}; - Tensor a_quant = context.CreateGPUTensor(DataTypeImpl::GetType(), a_quant_shape); - TensorShapeVector a_scales_dims({1, 1, M, K / kBlockSizeA}); - Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims); - quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec4Components)}}) - .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), gsl::narrow(1)}, - {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), gsl::narrow(1)}}) - .AddUniformVariable({static_cast(M * K / kVec4Components)}); - ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program)); - - constexpr uint32_t kTileSize = 64; - TensorShape reshaped_y_shape{1, M, N / kVec4Components}; - DP4AMatMulNBitsProgram mul_program; - mul_program.SetWorkgroupSize(256); - mul_program.SetDispatchGroupSize( - (M + kTileSize - 1) / kTileSize, - (N + kTileSize - 1) / kTileSize, 1); - mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec4Components)}, - {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}, - {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec2Components * kU32Components)}, - {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}}) - .AddUniformVariables({{static_cast(M)}, - {static_cast(N)}, - {static_cast(K)}, - {static_cast(K / 8)}, - {static_cast(K / 16)}}) - .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow(kVec4Components)}); - return context.RunProgram(mul_program); + if (M >= kMinMForTileOptimization && + CanApplyDP4AMatrixMatMulNBits(context, accuracy_level_, block_size, batch_count, N, K, components_a, has_zero_points)) { + return ApplyDP4AMatrixMatMulNBits(a, b, scales, M, N, K, block_size, context, y); } // TODO: Support output_number > 1. Some cases are failed when output_number > 1. constexpr uint32_t output_number = 1; const uint32_t tile_m = M > kMinMForTileOptimization ? 
4 : 1; + const bool has_subgroup = context.Device().HasFeature(wgpu::FeatureName::Subgroups); const bool use_subgroup = has_subgroup && context.AdapterInfo().vendor == std::string_view{"intel"} && components_a == 4 && block_size == 32; MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points, use_subgroup}; if (M > kMinMForTileOptimization && block_size == 32) { diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h index 3d72629bf6b25..10221e19c7400 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h @@ -35,25 +35,6 @@ class MatMulNBitsProgram final : public Program { bool use_subgroup_; }; -class DP4AMatMulQuantizeProgram final : public Program { - public: - DP4AMatMulQuantizeProgram() : Program{"DP4AMatMulQuantize"} {} - Status GenerateShaderCode(ShaderHelper& sh) const override; - WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"input_size", ProgramUniformVariableDataType::Uint32}); -}; - -class DP4AMatMulNBitsProgram final : public Program { - public: - DP4AMatMulNBitsProgram() : Program{"DP4AMatMulNBits"} {} - Status GenerateShaderCode(ShaderHelper& sh) const override; - WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( - {"M", ProgramUniformVariableDataType::Uint32}, - {"N", ProgramUniformVariableDataType::Uint32}, - {"K", ProgramUniformVariableDataType::Uint32}, - {"K8", ProgramUniformVariableDataType::Uint32}, - {"K16", ProgramUniformVariableDataType::Uint32}); -}; - class MatMulNBits final : public WebGpuKernel { public: MatMulNBits(const OpKernelInfo& info) : WebGpuKernel(info) { From c28bf78841458eee694924408c40ac7958278a16 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 7 Mar 2025 12:13:41 +1000 Subject: [PATCH 32/46] Example custom op with output type inferencing (#23916) ### Description Add example of a custom op that is required to do type inference for the output type for the model load to work. Also acts as an example of how to override an ONNX op with a custom implementation. 
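For reference, wiring the example into a session looks roughly like this (a sketch using the public C++ API; `CustomCast` is the test helper added in this change, and the model is the test data file added below):

````
#include "onnxruntime_cxx_api.h"

// Register CustomCast in the ONNX domain (the empty string) so it overrides
// the built-in Cast; output type inference then runs at model load via
// CustomCast::InferOutputShape.
void RunWithCustomCast(Ort::Env& env) {
  static CustomCast custom_op{"CPUExecutionProvider"};

  Ort::CustomOpDomain domain{""};
  domain.Add(&custom_op);

  Ort::SessionOptions session_options;
  session_options.Add(domain);

  Ort::Session session{env, ORT_TSTR("testdata/cast_float_to_double.onnx"), session_options};
  // session.Run(...) as usual; the Cast node now executes the custom kernel.
}
````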
### Motivation and Context #23891 --- .../core/session/onnxruntime_cxx_api.h | 3 + onnxruntime/core/session/custom_ops.cc | 23 ++++--- .../test/shared_lib/custom_op_utils.cc | 20 ++++++ onnxruntime/test/shared_lib/custom_op_utils.h | 61 +++++++++++++++++- onnxruntime/test/shared_lib/test_inference.cc | 30 ++++++++- .../test/testdata/cast_float_to_double.onnx | Bin 0 -> 136 bytes 6 files changed, 126 insertions(+), 11 deletions(-) create mode 100644 onnxruntime/test/testdata/cast_float_to_double.onnx diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 89488b5158c93..979b478e2fbb4 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2532,6 +2532,9 @@ struct CustomOpBase : OrtCustomOp { return std::vector{}; } + // Ort::CustomOpBase derived class should provide the following static method with the type/shape inferencing + // implementation if needed: + // static OrtStatusPtr InferOutputShape(Ort::ShapeInferContext& context) template decltype(&C::InferOutputShape) SetShapeInferFn(decltype(&C::InferOutputShape)) { OrtCustomOp::InferOutputShapeFn = [](const OrtCustomOp*, OrtShapeInferContext* ort_ctx) -> OrtStatusPtr { diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index bb9f278d83cf7..f583767346d88 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -900,13 +900,14 @@ KernelCreateInfo CreateKernelCreateInfo(const std::string& domain, const OrtCust ONNX_NAMESPACE::OpSchema CreateSchema(const std::string& domain, const std::vector& ops) { // The function registers the first schema assuming all the other one are the same except the types constraints. ORT_ENFORCE(ops.size() > 0, "No kernels to registers."); - int undefined = 0; + int num_inputs_with_dynamic_type = 0; // Creation of the schema for the first kernel in ops. const OrtCustomOp* op = *ops.begin(); ONNX_NAMESPACE::OpSchema schema(op->GetName(op), "custom op registered at runtime", 0); - auto create_type_constraint = [&ops, &schema, &undefined](const OrtCustomOp* op, int count, int i, bool is_input) { + auto create_type_constraint = [&ops, &schema, &num_inputs_with_dynamic_type]( + const OrtCustomOp* op, int count, int i, bool is_input) { onnx::OpSchema::FormalParameterOption option = onnx::OpSchema::FormalParameterOption::Single; bool is_homogeneous = true; int min_arity = 1; @@ -976,7 +977,9 @@ ONNX_NAMESPACE::OpSchema CreateSchema(const std::string& domain, const std::vect } else { // all_types is empty. As mentioned in the previous loop, all types are allowed. 
schema.TypeConstraint(name, DataTypeImpl::ToString(SUPPORTED_TENSOR_TYPES), "all types"); - undefined++; + if (is_input) { + ++num_inputs_with_dynamic_type; + } } }; @@ -985,19 +988,21 @@ ONNX_NAMESPACE::OpSchema CreateSchema(const std::string& domain, const std::vect create_type_constraint(op, static_cast(input_count), static_cast(i), true); } + const bool have_shape_infer_fn = op->version >= min_ort_version_with_shape_inference && op->InferOutputShapeFn; + const size_t output_count = op->GetOutputTypeCount(op); for (size_t i = 0; i < output_count; i++) { const auto type = op->GetOutputType(op, i); if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) { if (op->GetOutputCharacteristic(op, i) == OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED) { - ORT_ENFORCE(1 == undefined, - "There must be one (and only one) dynamic typed input to the custom op. " - "Its type info at runtime will be used to infer the type info of this dynamic typed output " - "which is required for the success of the model loading step. " - "More than one dynamic typed inputs are currently not supported as differing types at runtime " - "means the output type cannot be inferred without which model loading cannot proceed."); + // if there's a dynamically typed input and output we infer they both have the same type from the input. + // if that isn't the case the user must provide an output shape inference fn which must set the output type. + ORT_ENFORCE(num_inputs_with_dynamic_type == 1 || have_shape_infer_fn, + "The type of a dynamically typed output can be inferred from a single dynamically typed input, " + "or by a user provided OrtCustomOp->InferOutputShapeFn that sets the output type."); } } + create_type_constraint(op, static_cast(output_count), static_cast(i), false); } diff --git a/onnxruntime/test/shared_lib/custom_op_utils.cc b/onnxruntime/test/shared_lib/custom_op_utils.cc index bf7efacdbb505..a624479bcd00b 100644 --- a/onnxruntime/test/shared_lib/custom_op_utils.cc +++ b/onnxruntime/test/shared_lib/custom_op_utils.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include #include "gtest/gtest.h" #include "custom_op_utils.h" @@ -639,3 +640,22 @@ void StandaloneCustomKernel::Compute(OrtKernelContext* context) { StandaloneCustomKernel::~StandaloneCustomKernel() { } + +OrtStatusPtr CustomCastKernel::ComputeV2(OrtKernelContext* context) { + Ort::KernelContext ctx(context); + + auto in = ctx.GetInput(0); + std::vector shape = in.GetTensorTypeAndShapeInfo().GetShape(); + int64_t num_elements = std::accumulate(shape.cbegin(), shape.cend(), int64_t(1), std::multiplies()); + + // CustomCast::GetInputType constraint ensures we only get float input + const float* data = in.GetTensorData(); + double* out_data = ctx.GetOutput(0, shape).GetTensorMutableData(); + gsl::span input_span(data, num_elements); + gsl::span output_span(out_data, num_elements); + + std::transform(input_span.begin(), input_span.end(), output_span.begin(), + [](float val) { return static_cast(val); }); + + return nullptr; +} diff --git a/onnxruntime/test/shared_lib/custom_op_utils.h b/onnxruntime/test/shared_lib/custom_op_utils.h index ea2a5f2771342..424c2e2fe3a08 100644 --- a/onnxruntime/test/shared_lib/custom_op_utils.h +++ b/onnxruntime/test/shared_lib/custom_op_utils.h @@ -458,4 +458,63 @@ struct MulTopOpFloat16 : Ort::CustomOpBase OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t) const { return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL; } -}; \ No newline at end of file +}; + +// +// Example overriding an operator where type inference is required for the output so kernel matching works correctly +// +struct CustomCastKernel { + CustomCastKernel(const OrtApi& /*ort_api*/, const OrtKernelInfo* /*info*/) + /*: ort_(ort_api)*/ { + } + + OrtStatusPtr ComputeV2(OrtKernelContext* context); + + private: + // const OrtApi& ort_; +}; + +// Custom Cast op that takes float input and converts based on 'to' attribute. +// Example implementation only supports cast to double. +struct CustomCast : Ort::CustomOpBase { + explicit CustomCast(const char* provider) : provider_(provider) { + // if overriding an ONNX op you need to set the opset versions you are overriding + start_ver_ = 7; // should match minimum ONNX schema you implement + // end_ver_ = ...; should match maximum ONNX schema you implement or unset for unlimited. + } + + // static method used by Ort::CustomOpBase::SetShapeInferFn + static OrtStatusPtr InferOutputShape(Ort::ShapeInferContext& context) { + auto shape = context.GetInputShape(0); + + // infer output type based on 'to'. 
+ auto to = context.GetAttrInt("to"); + if (to != ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { + return Ort::Status("Unexpected type", ORT_INVALID_ARGUMENT).release(); + } + + context.SetOutputShape(0, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE); + return nullptr; + } + + OrtStatusPtr CreateKernelV2(const OrtApi& api, const OrtKernelInfo* info, void** op_kernel) const { + Ort::ConstKernelInfo ki(info); + *op_kernel = new CustomCastKernel(api, info); + return nullptr; + }; + + const char* GetName() const { return "Cast"; }; + const char* GetExecutionProviderType() const { return provider_; }; + + size_t GetInputTypeCount() const { return 1; }; + ONNXTensorElementDataType GetInputType(size_t /*index*/) const { + // example only accepts float input + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + }; + + size_t GetOutputTypeCount() const { return 1; }; + ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; }; + + private: + const char* provider_{"CPUExecutionProvider"}; +}; diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 4216efdfdfdb8..b517ba7032886 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -4805,4 +4805,32 @@ TEST(CApiTest, GenerateNodeStatsFile) { output_names, 1); } -#endif \ No newline at end of file +#endif + +// Test that creates a custom Cast kernel which requires type inference of the output type to work. +// Also demonstrates overriding an ONNX operator as we register the custom op in the ONNX domain. +TEST(CApiTest, custom_cast) { + std::vector> inputs(1); + auto& input = inputs[0]; + input.name = "input"; + input.dims = {3, 4}; + input.values = {1.0f, 2.0f, 3.0f, 4.0f, + -1.0f, -2.0f, -3.0f, -4.0f, + 1.0f, 2.0f, 3.0f, 4.0f}; + + // prepare expected inputs and outputs + std::vector expected_dims_y = {3, 4}; + std::vector expected_values_y = {1.0, 2.0, 3.0, 4.0, + -1.0, -2.0, -3.0, -4.0, + 1.0, 2.0, 3.0, 4.0}; + + CustomCast custom_op{onnxruntime::kCpuExecutionProvider}; + + Ort::CustomOpDomain custom_op_domain(""); // onnx domain is empty string + custom_op_domain.Add(&custom_op); + + // model with Cast from ONNX test data + TestInference(*ort_env, TSTR("testdata/cast_float_to_double.onnx"), + inputs, "output", expected_dims_y, expected_values_y, 0, + custom_op_domain, nullptr); +} diff --git a/onnxruntime/test/testdata/cast_float_to_double.onnx b/onnxruntime/test/testdata/cast_float_to_double.onnx new file mode 100644 index 0000000000000000000000000000000000000000..dc7997cddd8a8c762e354316662fb0d734e25e86 GIT binary patch literal 136 zcmdfpOwLZtOVKS!EiSPt;8NgX&CDw(EfHeNFD(JmN-WNa#U)ytTudeT65I-kD&v!ZqVaA%{*EE>CHe6#{-I7ju2JGJ&3s%u9E?I7TudCyK+KXP!38x=2qeRe Mka1$+Vh|7o0L&R4`v3p{ literal 0 HcmV?d00001 From 1199dc081582fc25fbbd17e7130341391bad704c Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Thu, 6 Mar 2025 20:24:07 -0800 Subject: [PATCH 33/46] Enabling L2+ Optimizations for EPs (#23517) There are some requirements to modify the graph which are specific to the EP/hardware. ORT has the hardcoded EP list for optimizations but that can't scale and it's hard be extended to enable EP custom optimizations. Here is the prototype to enable L2+ optimizations for EPs (The original overview is provided by @skottmckay) as well as the TRT EP implementation for the ConstantFoldingDQ optimization. 
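In sketch form, an EP's GetCapability is expected to drive this as follows (illustrative only: `MyEp` and `SelectNodesForMyEp` are stand-ins for the EP's own types and selection logic, and provider-bridge EPs obtain the selection function via `g_host->GetOptimizerByName` rather than calling the registry directly):

````
std::vector<std::unique_ptr<ComputeCapability>>
MyEp::GetCapability(const GraphViewer& graph_viewer,
                    const IKernelLookup& /*kernel_lookup*/,
                    const GraphOptimizerRegistry& graph_optimizer_registry,
                    IResourceAccountant* /*resource_accountant*/) const {
  // EP-specific node selection (stand-in for the EP's existing logic).
  std::vector<std::unique_ptr<ComputeCapability>> result = SelectNodesForMyEp(graph_viewer);

  // Look up a pre-defined optimizer by name to get its selection function.
  std::string optimizer_name{kConstantFoldingDQ};
  auto selection_func = graph_optimizer_registry.GetSelectionFunc(optimizer_name);

  if (selection_func.has_value() && !result.empty()) {
    // Run the selection function; each returned ComputeCapability already has
    // optimization_func set by the optimizer.
    auto selection_ccs = (*selection_func)(graph_viewer, KeyValueConfig{}, graph_optimizer_registry);

    // Attach them as nodes_to_optimize so ORT applies the optimization after
    // assignment and before Compile (simplified: all attached to one capability).
    for (auto& selection_cc : selection_ccs) {
      result.back()->nodes_to_optimize.push_back(std::move(selection_cc));
    }
  }

  return result;
}
````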
Signatures for selection and optimization functions:
````
- Selection: std::function<std::vector<std::unique_ptr<ComputeCapability>>(const GraphViewer&, const KeyValueConfig&)>
- Optimization: std::function<Status(Graph& graph, const ComputeCapability& this_optimization, ComputeCapability& cc_to_update)>
````

GetCapability
- call (new) provider bridge API to look up a pre-defined optimizer by name and get its selection function
- ComputeCapability.optimization_func, i.e. the optimization function, would be set by the optimizer to the function that does the optimization
- the EP has to update the returned ComputeCapability to include the optimization ComputeCapability in nodes_to_optimize, so that ORT can later perform the optimization/transformation accordingly

GraphPartitioner
- After assigning the ComputeCapability to the EP and prior to Compile, if the ComputeCapability has nodes_to_optimize, iterate that list
- the optimization function needs to be called with
  - a mutable Graph instance
  - the ComputeCapability for the individual optimization
  - the overall ComputeCapability so it can be updated
---
 cmake/onnxruntime_optimizer.cmake | 1 +
 .../core/framework/execution_provider.h | 16 ++
 .../core/graph/indexed_sub_graph.h | 6 +
 .../core/framework/compute_capability.h | 20 ++
 .../core/framework/execution_provider.cc | 1 +
 .../core/framework/graph_partitioner.cc | 220 +++++++++++-------
 .../core/framework/graph_partitioner.h | 9 +-
 .../core/optimizer/constant_folding.cc | 13 +-
 onnxruntime/core/optimizer/constant_folding.h | 18 ++
 .../optimizer/graph_optimizer_registry.cc | 49 ++++
 .../core/optimizer/graph_optimizer_registry.h | 77 ++++++
 .../constant_folding_dq_node.cc | 26 +++
 .../constant_folding_dq_node.h | 37 +++
 .../selection_and_optimization_func.cc | 99 ++++++++
 .../selection_and_optimization_func.h | 31 +++
 .../providers/acl/acl_execution_provider.cc | 1 +
 .../providers/acl/acl_execution_provider.h | 1 +
 .../providers/cann/cann_execution_provider.cc | 1 +
 .../providers/cann/cann_execution_provider.h | 1 +
 .../coreml/coreml_execution_provider.cc | 1 +
 .../coreml/coreml_execution_provider.h | 1 +
 .../providers/cuda/cuda_execution_provider.cc | 1 +
 .../providers/cuda/cuda_execution_provider.h | 1 +
 .../src/ExecutionProvider.cpp | 6 +-
 .../src/ExecutionProvider.h | 3 +
 .../providers/dnnl/dnnl_execution_provider.cc | 1 +
 .../providers/dnnl/dnnl_execution_provider.h | 1 +
 .../providers/js/js_execution_provider.cc | 1 +
 .../core/providers/js/js_execution_provider.h | 1 +
 .../migraphx/migraphx_execution_provider.cc | 1 +
 .../migraphx/migraphx_execution_provider.h | 1 +
 .../nnapi_builtin/nnapi_execution_provider.cc | 1 +
 .../nnapi_builtin/nnapi_execution_provider.h | 1 +
 .../openvino/openvino_execution_provider.cc | 1 +
 .../openvino/openvino_execution_provider.h | 1 +
 .../providers/qnn/qnn_execution_provider.cc | 1 +
 .../providers/qnn/qnn_execution_provider.h | 1 +
 .../rknpu/rknpu_execution_provider.cc | 1 +
 .../rknpu/rknpu_execution_provider.h | 1 +
 .../providers/rocm/rocm_execution_provider.cc | 1 +
 .../providers/rocm/rocm_execution_provider.h | 1 +
 .../providers/shared_library/provider_api.h | 1 +
 .../provider_bridge_provider.cc | 3 +-
 .../shared_library/provider_interfaces.h | 9 +
 .../shared_library/provider_wrappedtypes.h | 3 +
 .../providers/snpe/snpe_execution_provider.cc | 1 +
 .../providers/snpe/snpe_execution_provider.h | 1 +
 .../tensorrt/tensorrt_execution_provider.cc | 55 ++++-
 .../tensorrt/tensorrt_execution_provider.h | 31 +++
 .../tensorrt_execution_provider_helper.cc | 129 ++++++++++
 .../vitisai/vitisai_execution_provider.cc | 2 +-
 .../vitisai/vitisai_execution_provider.h | 1 +
 .../vsinpu/vsinpu_execution_provider.cc | 1 +
.../vsinpu/vsinpu_execution_provider.h | 1 + .../webgpu/webgpu_execution_provider.cc | 1 + .../webgpu/webgpu_execution_provider.h | 1 + .../webnn/webnn_execution_provider.cc | 1 + .../webnn/webnn_execution_provider.h | 1 + .../xnnpack/xnnpack_execution_provider.cc | 1 + .../xnnpack/xnnpack_execution_provider.h | 1 + onnxruntime/core/session/inference_session.cc | 21 +- .../core/session/provider_bridge_ort.cc | 23 +- .../test/framework/inference_session_test.cc | 1 + .../test/framework/session_state_test.cc | 27 ++- .../internal_testing_execution_provider.cc | 1 + .../internal_testing_execution_provider.h | 1 + .../test/providers/qnn/qnn_test_utils.cc | 7 +- 67 files changed, 874 insertions(+), 107 deletions(-) create mode 100644 onnxruntime/core/optimizer/graph_optimizer_registry.cc create mode 100644 onnxruntime/core/optimizer/graph_optimizer_registry.h create mode 100644 onnxruntime/core/optimizer/qdq_transformer/constant_folding_dq_node.cc create mode 100644 onnxruntime/core/optimizer/qdq_transformer/constant_folding_dq_node.h create mode 100644 onnxruntime/core/optimizer/selection_and_optimization_func.cc create mode 100644 onnxruntime/core/optimizer/selection_and_optimization_func.h diff --git a/cmake/onnxruntime_optimizer.cmake b/cmake/onnxruntime_optimizer.cmake index 9d680cd04af10..173c872d4cc06 100644 --- a/cmake/onnxruntime_optimizer.cmake +++ b/cmake/onnxruntime_optimizer.cmake @@ -9,6 +9,7 @@ if (onnxruntime_MINIMAL_BUILD) list(APPEND onnxruntime_optimizer_src_patterns "${ONNXRUNTIME_INCLUDE_DIR}/core/optimizer/graph_transformer.h" "${ONNXRUNTIME_ROOT}/core/optimizer/graph_transformer.cc" + "${ONNXRUNTIME_ROOT}/core/optimizer/graph_optimizer_registry.cc" ) if (onnxruntime_EXTENDED_MINIMAL_BUILD) diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index c9a15de9ef897..2245ff5791feb 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -20,6 +20,7 @@ struct ComputeCapability; class KernelRegistry; struct KernelCreateInfo; class Node; +class GraphOptimizerRegistry; } // namespace onnxruntime #else #include @@ -129,10 +130,25 @@ class IExecutionProvider { and decide whether a node will be assigned to <*this> execution provider. For kernels registered in a kernel registry, `kernel_lookup` must be used to find a matching kernel for this EP. + + The graph_optimizer_registry is designed for enabling L2+ graph optimizations tailored for EPs. + These optimizations are applied after the graph partitioner assigns ComputeCapability to the EP + and before EP's "Compile" or fusion. + + Steps to use graph_optimizer_registry and create the optimization ComputeCapability: + 1. Lookup Optimizer: The EP calls provider bridge API to lookup pre-defined optimizer by name and get selection function. + - Example: g_host->GetOptimizerByName(optimizer_name, graph_optimizer_registry, selection_func) + 2. Run Selection Function: The EP executes the selection function to obtain the selection ComputeCapability. + - ComputeCapability.optimize_func would be set by the optimizer to the function that does the optimization. + 3. Create Optimization ComputeCapability: The EP uses the selection ComputeCapability to create the optimization ComputeCapability. + 4. Return ComputeCapability: The EP returns the final ComputeCapability, with nodes_to_optimize set to the optimization ComputeCapability. 
+
+ Note: For more detailed implementations of using graph_optimizer_registry, please refer to TensorRT EP.
 */
 virtual std::vector<std::unique_ptr<ComputeCapability>>
 GetCapability(const onnxruntime::GraphViewer& graph_viewer,
 const IKernelLookup& kernel_lookup,
+ const GraphOptimizerRegistry& graph_optimizer_registry,
 IResourceAccountant* resource_accountant = nullptr) const;

 /**
diff --git a/include/onnxruntime/core/graph/indexed_sub_graph.h b/include/onnxruntime/core/graph/indexed_sub_graph.h
index e457d3dcad1f1..088db79a7e005 100644
--- a/include/onnxruntime/core/graph/indexed_sub_graph.h
+++ b/include/onnxruntime/core/graph/indexed_sub_graph.h
@@ -72,6 +72,12 @@ struct IndexedSubGraph {
 return meta_def_.get();
 }

+ /** Gets the mutable meta definition needed to represent this subgraph as a FunctionProto.
+ @returns MetaDef instance if it has been set. nullptr if not. */
+ MetaDef* GetMutableMetaDef() {
+ return meta_def_.get();
+ }
+
 // Check if the accounting is enabled for the current EP
 bool IsAccountingEnabled() const {
 return resource_accountant != nullptr &&
diff --git a/onnxruntime/core/framework/compute_capability.h b/onnxruntime/core/framework/compute_capability.h
index 5f21ba2f013e0..819264b3960e7 100644
--- a/onnxruntime/core/framework/compute_capability.h
+++ b/onnxruntime/core/framework/compute_capability.h
@@ -2,8 +2,11 @@
 // Licensed under the MIT License.

 #pragma once
+#include <functional>
 #include "core/common/common.h"
 #include "core/graph/indexed_sub_graph.h"
+#include "core/graph/graph.h"
+#include "core/optimizer/graph_optimizer_registry.h"

 namespace onnxruntime {
 // A structure encodes a subgraph and the method to run it.
@@ -21,5 +24,22 @@ struct ComputeCapability {

 ComputeCapability(std::unique_ptr<IndexedSubGraph> t_sub_graph)
 : sub_graph(std::move(t_sub_graph)) {}
+
+ // Optional function to optimize this ComputeCapability.
+ // This will be called by ORT once the ComputeCapability is assigned to the EP.
+ std::function<Status(Graph&, const ComputeCapability& this_optimization, ComputeCapability& cc_to_update, const GraphOptimizerRegistry& graph_optimizer_registry)>
+ optimization_func;
+
+ // Optional ComputeCapability instances for sets of nodes within this ComputeCapability that should be optimized.
+ // when an optimization is applied, ORT will update this ComputeCapability to reflect the changes made.
+ // IndexedSubGraph.nodes:
+ // - update based on RemoveNode/AddNode calls
+ // IndexedSubGraph.MetaDef (if present):
+ // - inputs and outputs will be unchanged
+ // - constant_initializers MAY change if we constant fold an initializer during optimization
+ std::vector<std::unique_ptr<ComputeCapability>> nodes_to_optimize;
 };
 } // namespace onnxruntime
diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc
index 3a937a119d03b..df85daa006a43 100644
--- a/onnxruntime/core/framework/execution_provider.cc
+++ b/onnxruntime/core/framework/execution_provider.cc
@@ -14,6 +14,7 @@ namespace onnxruntime {
 std::vector<std::unique_ptr<ComputeCapability>>
 IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
 const IKernelLookup& kernel_lookup,
+ const GraphOptimizerRegistry&,
 IResourceAccountant*) const {
 std::vector<std::unique_ptr<ComputeCapability>> result;
 for (const auto& node : graph.Nodes()) {
diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc
index b79d0327c3ef5..ff4d300f665b1 100644
--- a/onnxruntime/core/framework/graph_partitioner.cc
+++ b/onnxruntime/core/framework/graph_partitioner.cc
@@ -142,13 +142,15 @@ struct GetCapabilityForEPParams {
 std::reference_wrapper<const layout_transformation::DebugGraphFn> debug_graph_fn;
 #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
 IResourceAccountant* resource_accountant;
+ std::reference_wrapper<const GraphOptimizerRegistry> graph_optimizer_registry;
 };

 auto get_capabilities = [](const IExecutionProvider& ep,
 const GraphViewer& graph_viewer,
 const IExecutionProvider::IKernelLookup& kernel_lookup,
- IResourceAccountant* resource_accountant) {
- auto capabilities = ep.GetCapability(graph_viewer, kernel_lookup, resource_accountant);
+ IResourceAccountant* resource_accountant,
+ const GraphOptimizerRegistry& graph_optimizer_registry) {
+ auto capabilities = ep.GetCapability(graph_viewer, kernel_lookup, graph_optimizer_registry, resource_accountant);
 // In theory an EP could return an empty capability. Remove those.
 capabilities.erase(std::remove_if(capabilities.begin(), capabilities.end(),
@@ -182,10 +184,11 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params, const l
 auto& graph = params.graph.get();
 auto& capabilities = params.capabilities.get();
+ const auto& graph_optimizer_registry = params.graph_optimizer_registry.get();

 {
 const GraphViewer graph_viewer(graph);
- capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant);
+ capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant, graph_optimizer_registry);

 if (capabilities.empty()) {
 return Status::OK();
@@ -223,7 +226,7 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params, const l
 capabilities.clear();

 const GraphViewer graph_viewer(graph);
- capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant);
+ capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, params.resource_accountant, graph_optimizer_registry);

 // all nodes with an index >= first_new_node with domain of kMSInternalNHWCDomain should be in the capabilities
 InlinedHashSet<NodeIndex> new_nodes_in_capabilities;
@@ -261,6 +264,7 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params, const l
 static Status GetCapabilityForEPForAotInlining(const GraphViewer& graph_viewer,
 const KernelRegistryManager& kernel_registry_mgr,
 const IExecutionProvider& current_ep,
+ const GraphOptimizerRegistry& graph_optimizer_registry,
 const logging::Logger& logger,
 std::vector<std::unique_ptr<ComputeCapability>>& capabilities) {
 const auto& ep_type = current_ep.Type();
@@ -272,14 +276,62 @@
 logger};

 // TODO: Provide EP with a capability to look inside the functions.
- capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, nullptr);
+ capabilities = get_capabilities(current_ep, graph_viewer, kernel_lookup, nullptr, graph_optimizer_registry);

 return Status::OK();
 }

 /**
- * Check if a node can be placed on a specific provider.
- * Do nothing if the node is already assigned
+ * Check whether the given IndexedSubGraph is available for assigning to a specific provider.
+ *
+ */
+static bool IsIndexedSubGraphAvailableForAssignment(Graph& graph,
+ const IndexedSubGraph& capability,
+ GraphPartitioner::Mode mode,
+ const std::string& provider_type) {
+ // The <provider> can run a single node in the <graph> if not using meta-defs.
+ if (capability.GetMetaDef() == nullptr && capability.nodes.size() == 1) {
+ auto* node = graph.GetNode(capability.nodes[0]);
+ if (nullptr != node && node->GetExecutionProviderType().empty()) {
+ // The node was not fused or assigned.
+ return true;
+ }
+ return false;
+ }
+
+ // if mode is kAssignOnly we want all nodes that can _potentially_ be taken by compiling EPs to be assigned,
+ // so that we aggregate the nodes covered and ensure the original nodes remain in the ORT format model by
+ // preventing level 2 and 3 optimizers from changing them. optimizers check the EP the node is assigned to
+ // and only make changes if the EP is on the optimizer's list of supported EPs. an EP that compiles nodes
+ // should never be on those lists.
+ //
+ // when the ORT format model is loaded we will process it normally with EP priority being applied for
+ // whichever EPs are enabled at the time.
+ //
+ // e.g. an Android NNAPI EP may take different/overlapping nodes to an iOS CoreML EP.
+ // We want the ORT format model to be able to be run as efficiently as possible on either platform,
+ // so we want all the nodes that either may take to be preserved. If we did not do this we would
+ // need to create one ORT format model for Android and one for iOS.
+ if (mode == GraphPartitioner::Mode::kAssignOnly) {
+ return true;
+ }
+
+ for (auto node_index : capability.nodes) {
+ const auto* node = graph.GetNode(node_index);
+ if ((nullptr == node) ||
+ (!node->GetExecutionProviderType().empty() && node->GetExecutionProviderType() != provider_type)) {
+ // The node was fused or assigned, so that the whole sub-graph will not be assigned to this <provider>.
+ // The assumption is that this <provider> can only run the sub-graph as a whole unit.
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Return a fused node or assign the nodes in the indexed subgraph to the current EP.
+ *
 * \param graph
 * \param capability
 * \param kernel_registry_mgr
@@ -298,75 +350,42 @@ static Node* PlaceNode(Graph& graph, const IndexedSubGraph& capability,
 if (nullptr == capability.GetMetaDef()) {
 TryAssignSingleNode(graph, capability, provider_type);
 } else {
- // The <provider> can run a fused <sub_graph> in the <graph>.
+ const bool acc_enabled = capability.IsAccountingEnabled();
+ if (mode == GraphPartitioner::Mode::kNormal) {
+ std::ostringstream oss;
+ oss << provider_type << "_" << capability.GetMetaDef()->name << "_" << fused_node_unique_id++;
+ std::string node_name = oss.str();
-
- // Check whether any node in the <sub_graph> was already assigned. If so it cannot be stolen as assignment is done
- // in order of EP priority
- bool sub_graph_available_for_assignment = true;
- if (mode != GraphPartitioner::Mode::kAssignOnly) {
- // if mode is kAssignOnly we want all nodes that can _potentially_ be taken by compiling EPs to be assigned,
- // so that we aggregate the nodes covered and ensure the original nodes remain in the ORT format model by
- // preventing level 2 and 3 optimizers from changing them. optimizers check the EP the node is assigned to
- // and only make changes if the EP is on the optimizer's list of supported EPs. an EP that compiles nodes
- // should never be on those lists.
- //
- // when the ORT format model is loaded we will process it normally with EP priority being applied for
- // whichever EPs are enabled at the time.
- //
- // e.g. an Android NNAPI EP may take different/overlapping nodes to an iOS CoreML EP.
- // We want the ORT format model to be able to be run as efficiently as possible on either platform,
- // so we want all the nodes that either may take to be preserved. If we did not do this we would
- // need to create one ORT format model for Android and one for iOS.
- for (auto node_index : capability.nodes) {
- const auto* node = graph.GetNode(node_index);
- if ((nullptr == node) ||
- (!node->GetExecutionProviderType().empty() && node->GetExecutionProviderType() != provider_type)) {
- // The node was fused or assigned, so that the whole sub-graph will not be assigned to this <provider>.
- // The assumption is that this <provider> can only run the sub-graph as a whole unit.
- sub_graph_available_for_assignment = false;
- break;
+ Node* fused_node = nullptr;
+ if (fusion_style == IExecutionProvider::FusionStyle::Function) {
+ fused_node = &graph.FuseSubGraph(capability, node_name);
+ } else {
+ // create a fused node without copying everything to a Function body. The IndexedSubGraph will be passed
+ // through to Compile via a filtered GraphViewer.
+ fused_node = &graph.BeginFuseSubGraph(capability, node_name);
 }
- fused_node->SetExecutionProviderType(provider_type);
- if (acc_enabled) {
- // We account for the fused node. We operate under the assumption
- // that the fused node would use no more memory than the nodes we are fusing,
- // and potentially less than that, and therefore, no threshold check is needed here.
- // All threshold checks are done within the EP.
- capability.ComputeAndAccountForNode(*fused_node);
- }
+ fused_node->SetExecutionProviderType(provider_type);
+ if (acc_enabled) {
+ // We account for the fused node. We operate under the assumption
+ // that the fused node would use no more memory than the nodes we are fusing,
+ // and potentially less than that, and therefore, no threshold check is needed here.
+ // All threshold checks are done within the EP.
+ capability.ComputeAndAccountForNode(*fused_node);
+ }
- result = fused_node;
- } else {
- // assign the nodes in the indexed subgraph to the current EP so that level 2+ optimizers will not change them.
- // This is used when exporting an ORT format model to maintain the original nodes and re-do the fusion
- // at runtime. The original nodes provide a fallback if fewer nodes can be fused at runtime due to device
- // capabilities.
- for (size_t i = 0, limit = capability.nodes.size(); i < limit; ++i) {
- auto* node = graph.GetNode(capability.nodes[i]);
- if (node != nullptr) {
- node->SetExecutionProviderType(provider_type);
- if (acc_enabled) {
- capability.AccountForNode(i);
- }
+ result = fused_node;
+ } else {
+ // assign the nodes in the indexed subgraph to the current EP so that level 2+ optimizers will not change them.
+ // This is used when exporting an ORT format model to maintain the original nodes and re-do the fusion
+ // at runtime. The original nodes provide a fallback if fewer nodes can be fused at runtime due to device
+ // capabilities.
+ for (size_t i = 0, limit = capability.nodes.size(); i < limit; ++i) {
+ auto* node = graph.GetNode(capability.nodes[i]);
+ if (node != nullptr) {
+ node->SetExecutionProviderType(provider_type);
+ if (acc_enabled) {
+ capability.AccountForNode(i);
 }
 }
 }
@@ -386,7 +405,8 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr,
 int& fused_node_unique_id,
 const layout_transformation::TransformLayoutFunction& transform_layout_fn,
 const layout_transformation::DebugGraphFn& debug_graph_fn,
- const logging::Logger& logger, IResourceAccountant* resource_accountant) {
+ const logging::Logger& logger, IResourceAccountant* resource_accountant,
+ const GraphOptimizerRegistry& graph_optimizer_registry) {
 // handle testing edge case where optimizers or constant lifting results in graph with no nodes.
 // doing it here saves all providers checking for this in GetCapability
 if (graph.NumberOfNodes() == 0) {
 return Status::OK();
@@ -400,7 +420,7 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr,
 // we pass through the FuncManager from the top level graph
 ORT_RETURN_IF_ERROR(PartitionOnnxFormatModelImpl(*subgraph, func_mgr, kernel_registry_mgr,
 fused_kernel_registry, current_ep, mode, fused_node_unique_id,
- transform_layout_fn, debug_graph_fn, logger, resource_accountant));
+ transform_layout_fn, debug_graph_fn, logger, resource_accountant, graph_optimizer_registry));
 }
 }

@@ -424,7 +444,8 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr,
 mode,
 std::cref(transform_layout_fn),
 std::cref(debug_graph_fn),
- resource_accountant};
+ resource_accountant,
+ std::ref(graph_optimizer_registry)};

 ORT_RETURN_IF_ERROR(GetCapabilityForEP(get_capability_params, logger));
 if (capabilities.empty()) {
@@ -450,7 +471,30 @@ static Status PartitionOnnxFormatModelImpl(Graph& graph, FuncManager& func_mgr,
 entry->sub_graph->GetMetaDef() != nullptr;
 }));
 for (auto& capability : capabilities) {
- Node* n = PlaceNode(graph, *capability->sub_graph, fusion_style, type, mode, fused_node_unique_id);
+ // The <provider> can run a fused <sub_graph> in the <graph>.
+ // Check whether any node in the <sub_graph> was already assigned. If so it cannot be stolen as assignment is done
+ // in order of EP priority
+ bool sub_graph_available_for_assignment = IsIndexedSubGraphAvailableForAssignment(graph, *capability->sub_graph, mode, type);
+
+ // If the <sub_graph> is available to be assigned to the EP and the ComputeCapability has nodes_to_optimize,
+ // run EP-related optimizations and update the ComputeCapability.
+ if (sub_graph_available_for_assignment && !capability->nodes_to_optimize.empty()) {
+ for (auto& optimization_cc : capability->nodes_to_optimize) {
+ if (optimization_cc->optimization_func) {
+ auto status = optimization_cc->optimization_func(graph, *optimization_cc, *capability, graph_optimizer_registry);
+ if (status != Status::OK()) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, type, "The optimization function failed to finish.");
+ }
+ // #TODO: Handle nested optimization ComputeCapability
+ }
+ }
+ }
+
+ Node* n = nullptr;
+ if (sub_graph_available_for_assignment) {
+ n = PlaceNode(graph, *capability->sub_graph, fusion_style, type, mode, fused_node_unique_id);
+ }
+
 if (n != nullptr) {
 // searching in kernel registries, if no kernel registered for the fused_node, use compile approach
 if (!KernelRegistryManager::HasImplementationOf(kernel_registry_mgr, *n, type, logger)) {
@@ -587,6 +631,7 @@ static Status InlineNodes(Graph& graph, bool& modified_graph) {
 static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_providers,
 const KernelRegistryManager& kernel_registry_mgr,
 Graph& graph,
+ const GraphOptimizerRegistry& graph_optimizer_registry,
 const logging::Logger& logger,
 InlinedHashSet<std::string>& not_inlined,
 size_t& inlined_count) {
@@ -603,6 +648,7 @@
 ORT_RETURN_IF_ERROR(InlineFunctionsAOTImpl(execution_providers,
 kernel_registry_mgr,
 *subgraph,
+ graph_optimizer_registry,
 logger,
 not_inlined,
 inlined_count));
@@ -627,7 +673,7 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
 InlinedHashSet<std::string> claimed_by_ep;
 for (const auto& ep : execution_providers) {
 std::vector<std::unique_ptr<ComputeCapability>> capabilities;
- ORT_RETURN_IF_ERROR(GetCapabilityForEPForAotInlining(graph_viewer, kernel_registry_mgr, *ep, logger,
+
ORT_RETURN_IF_ERROR(GetCapabilityForEPForAotInlining(graph_viewer, kernel_registry_mgr, *ep, graph_optimizer_registry, logger, capabilities)); for (auto& capability : capabilities) { const auto& nodes = capability->sub_graph->nodes; @@ -791,6 +837,7 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, const ExecutionProviders& execution_providers, KernelRegistryManager& kernel_registry_manager, const std::optional& acc_map, + const GraphOptimizerRegistry& graph_optimizer_registry, const logging::Logger& logger) { bool modified_graph = false; @@ -814,7 +861,7 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, fused_kernel_registry, *ep, mode, fused_node_unique_id, transform_layout_function, partition_params.debug_graph_fn, - logger, resource_accountant)); + logger, resource_accountant, graph_optimizer_registry)); } // expand any nodes that have an ONNX function definition but no matching ORT kernel. @@ -835,6 +882,7 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params, static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_params, KernelRegistryManager& kernel_registry_mgr, IExecutionProvider& current_ep, + const GraphOptimizerRegistry& graph_optimizer_registry, const logging::Logger& logger) { // handle testing edge case where optimizers or constant lifting results in graph with no nodes. // doing it here saves all providers checking for this in GetCapability @@ -850,7 +898,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param PartitionParams subgraph_partition_params = partition_params; subgraph_partition_params.graph = std::ref(subgraph); ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(subgraph_partition_params, kernel_registry_mgr, - current_ep, logger)); + current_ep, graph_optimizer_registry, logger)); } } @@ -866,7 +914,8 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param std::cref(partition_params.transform_layout_function), std::cref(partition_params.debug_graph_fn), #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) - nullptr + nullptr, + std::ref(graph_optimizer_registry) }; // clang-format on @@ -959,10 +1008,11 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param static Status PartitionOrtFormatModel(const PartitionParams& partition_params, const ExecutionProviders& execution_providers, KernelRegistryManager& kernel_registry_manager, + const GraphOptimizerRegistry& graph_optimizer_registry, const logging::Logger& logger) { // process full graph with each EP for (const auto& ep : execution_providers) { - ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(partition_params, kernel_registry_manager, *ep, logger)); + ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(partition_params, kernel_registry_manager, *ep, graph_optimizer_registry, logger)); } return Status::OK(); @@ -989,6 +1039,7 @@ Status GraphPartitioner::InlineFunctionsAOT(Model& model, ORT_RETURN_IF_ERROR(InlineFunctionsAOTImpl(execution_providers, kernel_registry_manager, graph, + *graph_optimizer_registry_, logger, not_inlined, inlined_count)); @@ -1045,8 +1096,7 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, std::ref(*fused_kernel_registry), std::ref(fused_node_unique_id), std::cref(transform_layout_function), - std::cref(debug_graph_fn), - }; + std::cref(debug_graph_fn)}; #else // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1075,7 +1125,7 
@@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, ORT_RETURN_IF_ERROR(NodeStatsRecorder::CreateAccountants(config_options, graph.ModelPath(), ep_acc_map)); ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, - ep_acc_map, logger)); + ep_acc_map, *graph_optimizer_registry_, logger)); if (ep_context_enabled) { std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); @@ -1089,7 +1139,7 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr, return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "ONNX models are not supported in this build."); #endif //! defined(ORT_MINIMAL_BUILD) } else { - ORT_RETURN_IF_ERROR(PartitionOrtFormatModel(partition_params, providers_, kernel_registry_mgr_, logger)); + ORT_RETURN_IF_ERROR(PartitionOrtFormatModel(partition_params, providers_, kernel_registry_mgr_, *graph_optimizer_registry_, logger)); } #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/framework/graph_partitioner.h b/onnxruntime/core/framework/graph_partitioner.h index d1ef193cf1520..b9d4022cb5a14 100644 --- a/onnxruntime/core/framework/graph_partitioner.h +++ b/onnxruntime/core/framework/graph_partitioner.h @@ -7,6 +7,7 @@ #include "core/graph/graph.h" #include "core/framework/fuse_nodes_funcs.h" #include "core/framework/transform_layout_functions.h" +#include "core/optimizer/graph_optimizer_registry.h" namespace onnxruntime { @@ -24,9 +25,12 @@ class GraphPartitioner { }; // The order of providers represents the user preference. - GraphPartitioner(KernelRegistryManager& kernel_registry_mgr, const ExecutionProviders& providers) + GraphPartitioner(KernelRegistryManager& kernel_registry_mgr, + const ExecutionProviders& providers, + std::unique_ptr graph_optimizer_registry) : kernel_registry_mgr_(kernel_registry_mgr), - providers_(providers) { + providers_(providers), + graph_optimizer_registry_(std::move(graph_optimizer_registry)) { } // Run partitioning. 
@@ -64,6 +68,7 @@ class GraphPartitioner { KernelRegistryManager& kernel_registry_mgr_; const ExecutionProviders& providers_; + std::unique_ptr<GraphOptimizerRegistry> graph_optimizer_registry_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index e755b4bfa6364..e36eef672c1ed 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -21,7 +21,16 @@ ConstantFolding::ConstantFolding(const IExecutionProvider& execution_provider, const ConfigOptions& config_options, const InlinedHashSet<std::string_view>& compatible_execution_providers, const InlinedHashSet<std::string>& excluded_initializers) noexcept - : GraphTransformer("ConstantFolding", compatible_execution_providers), + : ConstantFolding("ConstantFolding", execution_provider, skip_dequantize_linear, config_options, compatible_execution_providers, excluded_initializers) { +} + +ConstantFolding::ConstantFolding(const std::string& name, + const IExecutionProvider& execution_provider, + bool skip_dequantize_linear, + const ConfigOptions& config_options, + const InlinedHashSet<std::string_view>& compatible_execution_providers, + const InlinedHashSet<std::string>& excluded_initializers) noexcept + : GraphTransformer(name, compatible_execution_providers), skip_dequantize_linear_(skip_dequantize_linear), config_options_(config_options), excluded_initializers_(excluded_initializers), @@ -144,7 +153,7 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, for (NodeIndex i : order) { auto* node = graph.GetNode(i); - if (!node) { + if (!node || !AllowConstantFolding(*node)) { continue; } diff --git a/onnxruntime/core/optimizer/constant_folding.h b/onnxruntime/core/optimizer/constant_folding.h index 14eb2a9c5f06b..29bc67d560788 100644 --- a/onnxruntime/core/optimizer/constant_folding.h +++ b/onnxruntime/core/optimizer/constant_folding.h @@ -28,6 +28,24 @@ class ConstantFolding : public GraphTransformer { const InlinedHashSet<std::string_view>& compatible_execution_providers = {}, const InlinedHashSet<std::string>& excluded_initializers = {}) noexcept; + protected: + /** + * Same as the constructor above, but with a name provided by the derived class. + */ + ConstantFolding(const std::string& name, + const IExecutionProvider& execution_provider, + bool skip_dequantize_linear, + const ConfigOptions& config_options, + const InlinedHashSet<std::string_view>& compatible_execution_providers = {}, + const InlinedHashSet<std::string>& excluded_initializers = {}) noexcept; + /** + * A derived class can override this virtual function to limit the nodes that can be constant folded. + */ + virtual bool AllowConstantFolding(const Node& node) const { + ORT_UNUSED_PARAMETER(node); + return true; + } + private: Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override; diff --git a/onnxruntime/core/optimizer/graph_optimizer_registry.cc b/onnxruntime/core/optimizer/graph_optimizer_registry.cc new file mode 100644 index 0000000000000..8ede372470485 --- /dev/null +++ b/onnxruntime/core/optimizer/graph_optimizer_registry.cc @@ -0,0 +1,49 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
+ +#include "core/optimizer/graph_optimizer_registry.h" +#include "core/optimizer/graph_transformer_utils.h" +#include "core/optimizer/selection_and_optimization_func.h" +#include "core/optimizer/qdq_transformer/constant_folding_dq_node.h" + +using namespace onnxruntime; +using namespace ::onnxruntime::common; + +namespace onnxruntime { +#if !defined(ORT_MINIMAL_BUILD) +GraphOptimizerRegistry::GraphOptimizerRegistry(const onnxruntime::SessionOptions* sess_options, + const onnxruntime::IExecutionProvider* cpu_ep, + const logging::Logger* logger) : session_options_(sess_options), + cpu_ep_(cpu_ep), + logger_(logger) { + auto status = CreatePredefinedSelectionFuncs(); + ORT_ENFORCE(status.IsOK(), "Could not create pre-defined selection functions. Error Message: ", + status.ErrorMessage()); +} + +Status GraphOptimizerRegistry::CreatePredefinedSelectionFuncs() { + transformer_name_to_selection_func_[kConstantFoldingDQ] = ConstantFoldingDQFuncs::Select; + + return Status::OK(); +} + +std::optional<SelectionFunc> GraphOptimizerRegistry::GetSelectionFunc(std::string& name) const { + auto lookup = transformer_name_to_selection_func_.find(name); + if (lookup != transformer_name_to_selection_func_.end()) { + return lookup->second; + } + LOGS(*logger_, WARNING) << "Can't find selection function for " << name; + return std::nullopt; +} +#else +GraphOptimizerRegistry::GraphOptimizerRegistry(const onnxruntime::SessionOptions* sess_options, + const onnxruntime::IExecutionProvider* cpu_ep, + const logging::Logger* logger) : session_options_(sess_options), + cpu_ep_(cpu_ep), + logger_(logger) {} + +std::optional<SelectionFunc> GraphOptimizerRegistry::GetSelectionFunc(std::string& /*name*/) const { + return std::nullopt; +} +#endif +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/graph_optimizer_registry.h b/onnxruntime/core/optimizer/graph_optimizer_registry.h new file mode 100644 index 0000000000000..15c9287c0eac8 --- /dev/null +++ b/onnxruntime/core/optimizer/graph_optimizer_registry.h @@ -0,0 +1,77 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/inlined_containers.h" +#include "core/common/logging/logging.h" +#include "core/common/common.h" +#include "core/optimizer/graph_transformer.h" +#include "core/framework/execution_providers.h" +#include "core/framework/compute_capability.h" + +namespace onnxruntime { +class GraphOptimizerRegistry; + +/** + * Optimizer's selection function: Selects a set of nodes from a given graph for optimization. Additional key/value strings can be provided to configure the optimizer. + * If needed, use graph_optimizer_registry to access the session options, the CPU EP and the logger. + * + * Optimizer's optimization function: Gets the nodes in ComputeCapability from nodes_to_optimize. Use graph_optimizer_registry to access the session options, the CPU EP + * and the logger if needed to create the optimizer. Run optimization on the nodes/subgraph, and finally, update the ComputeCapability. + * + */ +using KeyValueConfig = std::unordered_map<std::string, std::string>; +using SelectionFunc = std::function<std::vector<std::unique_ptr<ComputeCapability>>(const GraphViewer&, + const KeyValueConfig&, + const GraphOptimizerRegistry& graph_optimizer_registry)>; +using OptimizationFunc = std::function<Status(Graph&, + const ComputeCapability&, + ComputeCapability&, + const GraphOptimizerRegistry&)>; + +/** + * A registration/lookup class for re-usable optimizers for EPs. + */ +class GraphOptimizerRegistry { + public: + /** + * The constructor takes in session options, the CPU EP and a logger as these are required by some optimizers. + */ + GraphOptimizerRegistry(const onnxruntime::SessionOptions* sess_options, + const onnxruntime::IExecutionProvider* cpu_ep, + const logging::Logger* logger); + + /** + * Get optimizer selection function. If the optimizer name can't be found, return nullopt. + */ + std::optional<SelectionFunc> GetSelectionFunc(std::string& name) const; + + /** + * Get CPU EP. + */ + const onnxruntime::IExecutionProvider& GetCpuEp() const { return *cpu_ep_; } + + /** + * Get Session Options. + */ + const onnxruntime::SessionOptions& GetSessionOptions() const { return *session_options_; } + + /** + * Get Logger. + */ + const logging::Logger* GetLogger() const { return logger_; } + + private: + const onnxruntime::SessionOptions* session_options_; + const onnxruntime::IExecutionProvider* cpu_ep_; + const logging::Logger* logger_; + +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + InlinedHashMap<std::string, SelectionFunc> transformer_name_to_selection_func_; + + /** + * Create pre-defined selection functions. + */ + Status CreatePredefinedSelectionFuncs(); +#endif +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/qdq_transformer/constant_folding_dq_node.cc b/onnxruntime/core/optimizer/qdq_transformer/constant_folding_dq_node.cc new file mode 100644 index 0000000000000..a2f46d6ae693c --- /dev/null +++ b/onnxruntime/core/optimizer/qdq_transformer/constant_folding_dq_node.cc @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/optimizer/qdq_transformer/constant_folding_dq_node.h" +#include "core/optimizer/graph_optimizer_registry.h" +#include "core/graph/graph_utils.h" + +namespace onnxruntime { + +ConstantFoldingDQ::ConstantFoldingDQ(const IExecutionProvider& execution_provider, + bool skip_dequantize_linear, + const ConfigOptions& config_options, + const InlinedHashSet<NodeIndex>& node_index_set, + const InlinedHashSet<std::string_view>& compatible_execution_providers, + const InlinedHashSet<std::string>& excluded_initializers) noexcept + : ConstantFolding("ConstantFoldingDQ", execution_provider, skip_dequantize_linear, config_options, compatible_execution_providers, excluded_initializers), + node_index_set_(node_index_set) {} + +bool ConstantFoldingDQ::AllowConstantFolding(const Node& node) const { + if (node_index_set_.find(node.Index()) != node_index_set_.end()) { + return true; + } + return false; +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/qdq_transformer/constant_folding_dq_node.h b/onnxruntime/core/optimizer/qdq_transformer/constant_folding_dq_node.h new file mode 100644 index 0000000000000..7aed87fa06adb --- /dev/null +++ b/onnxruntime/core/optimizer/qdq_transformer/constant_folding_dq_node.h @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/optimizer/graph_transformer.h" +#include "core/optimizer/constant_folding.h" +#include "core/framework/ort_value.h" +#include +#include "core/framework/execution_provider.h" + +namespace onnxruntime { + +/** +@class ConstantFoldingDQ + +A transformer derived from ConstantFolding that restricts folding to a caller-provided set of nodes. +*/ +class ConstantFoldingDQ : public ConstantFolding { + public: + /*! Constant folding will not be applied to nodes that have one of the initializers from excluded_initializers as input. \param execution_provider Execution provider instance to execute constant folding. + */ + ConstantFoldingDQ(const IExecutionProvider& execution_provider, + bool skip_dequantize_linear, + const ConfigOptions& config_options, + const InlinedHashSet<NodeIndex>& node_index_set, + const InlinedHashSet<std::string_view>& compatible_execution_providers = {}, + const InlinedHashSet<std::string>& excluded_initializers = {}) noexcept; + + bool AllowConstantFolding(const Node& node) const override; + + private: + InlinedHashSet<NodeIndex> node_index_set_; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/selection_and_optimization_func.cc b/onnxruntime/core/optimizer/selection_and_optimization_func.cc new file mode 100644 index 0000000000000..151c61952a631 --- /dev/null +++ b/onnxruntime/core/optimizer/selection_and_optimization_func.cc @@ -0,0 +1,99 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "selection_and_optimization_func.h" +#include "core/graph/graph_utils.h" +#include "core/framework/compute_capability.h" +#include "core/optimizer/qdq_transformer/constant_folding_dq_node.h" + +namespace onnxruntime { + +std::vector<std::unique_ptr<ComputeCapability>> ConstantFoldingDQFuncs::Select(const GraphViewer& graph_viewer, + const KeyValueConfig& /*config*/, + const GraphOptimizerRegistry& /*graph_optimizer_registry*/) { + std::vector<std::unique_ptr<ComputeCapability>> result; + std::unique_ptr<IndexedSubGraph> sub_graph = std::make_unique<IndexedSubGraph>(); + const std::vector<NodeIndex>& node_index = graph_viewer.GetNodesInTopologicalOrder(ExecutionOrder::PRIORITY_BASED /*priority-based topological sort*/); + InitializedTensorSet constant_inputs; + const InlinedHashSet<std::string> excluded_initializers; + + // Select DequantizeLinear nodes where all inputs are constant + for (const auto& index : node_index) { + const auto& node = graph_viewer.GetNode(index); + if (node->OpType() != "DequantizeLinear") { + continue; + } + if (!graph_utils::AllNodeInputsAreConstant(graph_viewer.GetGraph(), *node, constant_inputs, excluded_initializers)) { + continue; + } + sub_graph->nodes.push_back(index); + } + + result.push_back(std::make_unique<ComputeCapability>(std::move(sub_graph))); + result.back()->optimization_func = ConstantFoldingDQFuncs::Optimize; + return result; +} + +Status ConstantFoldingDQFuncs::Optimize(Graph& graph, + const ComputeCapability& optimization_cc, + ComputeCapability& cc_to_update, + const GraphOptimizerRegistry& graph_optimizer_registry) { + std::string optimizer_name = kConstantFoldingDQ; + std::unordered_set<std::string> original_initializers_to_remove; + std::unordered_set<std::string> new_initializers_to_add; + InlinedHashSet<NodeIndex> dq_node_index_set; + + // iterate the nodes in nodes_to_optimize to: + // 1. get original initializers to remove + // 2. add new initializers + // 3. create dq node index set + for (const auto& index : optimization_cc.sub_graph->nodes) { + auto node = graph.GetNode(index); + if (node->OpType() != "DequantizeLinear") { + continue; + } + auto input_0 = node->InputDefs()[0]; + auto output_0 = node->OutputDefs()[0]; + original_initializers_to_remove.insert(input_0->Name()); + new_initializers_to_add.insert(output_0->Name()); + dq_node_index_set.insert(index); + } + + static auto transformer = std::make_unique<ConstantFoldingDQ>(graph_optimizer_registry.GetCpuEp(), + false /*skip_dequantize_linear*/, + graph_optimizer_registry.GetSessionOptions().config_options, + dq_node_index_set); + + bool modified = false; + ORT_RETURN_IF_ERROR(transformer->Apply(graph, modified, *graph_optimizer_registry.GetLogger())); + + // update the overall ComputeCapability + std::vector<NodeIndex> updated_nodes; + for (auto index : cc_to_update.sub_graph->nodes) { + if (dq_node_index_set.find(index) != dq_node_index_set.end()) { + continue; + } + updated_nodes.push_back(index); + } + cc_to_update.sub_graph->nodes = updated_nodes; + + auto meta_def = cc_to_update.sub_graph->GetMutableMetaDef(); + std::vector<std::string> updated_constant_initializers; + + for (auto constant_initializer : meta_def->constant_initializers) { + if (original_initializers_to_remove.find(constant_initializer) != original_initializers_to_remove.end()) { + continue; + } + updated_constant_initializers.push_back(constant_initializer); + } + + for (auto constant_initializer : new_initializers_to_add) { + updated_constant_initializers.push_back(constant_initializer); + } + + meta_def->constant_initializers = updated_constant_initializers; + + return Status::OK(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/selection_and_optimization_func.h b/onnxruntime/core/optimizer/selection_and_optimization_func.h new file mode 100644 index 0000000000000..6ad62518833b0 --- /dev/null +++ b/onnxruntime/core/optimizer/selection_and_optimization_func.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/optimizer/graph_optimizer_registry.h" +#include "core/framework/compute_capability.h" +#include "core/graph/graph_viewer.h" + +namespace onnxruntime { +static const std::string kConstantFoldingDQ = "ConstantFoldingDQ"; + +/** + * Optimizer's selection function: Selects a set of nodes from a given graph for optimization. Additional key/value strings can be provided to configure the optimizer. + * If needed, use graph_optimizer_registry to access the session options, the CPU EP and the logger. + * + * Optimizer's optimization function: Gets the nodes in ComputeCapability from nodes_to_optimize. Use graph_optimizer_registry to access the session options, the CPU EP + * and the logger if needed to create the optimizer. Run optimization on the nodes/subgraph, and finally, update the ComputeCapability.
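+ * A rough usage sketch of this contract from an EP's perspective (illustrative only; the
+ * graph_viewer and graph_optimizer_registry variables are assumed to be in scope):
+ *   std::string name = kConstantFoldingDQ;
+ *   if (auto select = graph_optimizer_registry.GetSelectionFunc(name)) {
+ *     auto selection_ccs = (*select)(graph_viewer, KeyValueConfig{}, graph_optimizer_registry);
+ *     // each returned ComputeCapability carries optimization_func for the later Optimize step
+ *   }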
+ * + */ + +struct ConstantFoldingDQFuncs { + static std::vector<std::unique_ptr<ComputeCapability>> Select(const GraphViewer& graph_viewer, + const KeyValueConfig& configs, + const GraphOptimizerRegistry& graph_optimizer_registry); + static Status Optimize(Graph& graph, + const ComputeCapability& optimization_cc, + ComputeCapability& cc_to_update, + const GraphOptimizerRegistry& graph_optimizer_registry); +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/acl/acl_execution_provider.cc b/onnxruntime/core/providers/acl/acl_execution_provider.cc index ede476ff74d1b..def1d5e4b704c 100644 --- a/onnxruntime/core/providers/acl/acl_execution_provider.cc +++ b/onnxruntime/core/providers/acl/acl_execution_provider.cc @@ -153,6 +153,7 @@ std::shared_ptr<KernelRegistry> ACLExecutionProvider::GetKernelRegistry() const std::vector<std::unique_ptr<ComputeCapability>> ACLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant*) const { std::vector<std::unique_ptr<ComputeCapability>> result; for (const auto& node : graph.Nodes()) { diff --git a/onnxruntime/core/providers/acl/acl_execution_provider.h b/onnxruntime/core/providers/acl/acl_execution_provider.h index d635e56add30b..80e4aaaf021e3 100755 --- a/onnxruntime/core/providers/acl/acl_execution_provider.h +++ b/onnxruntime/core/providers/acl/acl_execution_provider.h @@ -39,6 +39,7 @@ class ACLExecutionProvider : public IExecutionProvider { std::vector<std::unique_ptr<ComputeCapability>> GetCapability( const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* resource_accountant) const override; Status OnRunStart(const onnxruntime::RunOptions&) override; diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc index 07e83933a890c..be09eefba791b 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.cc +++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc @@ -1254,6 +1254,7 @@ GetSubGraphPartition(const std::vector<NodeIndex>& topological_order, const std: std::vector<std::unique_ptr<ComputeCapability>> CANNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant*) const { std::vector<std::unique_ptr<ComputeCapability>> result; diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.h b/onnxruntime/core/providers/cann/cann_execution_provider.h index 5ff935463a1c1..f28ae77e49f83 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.h +++ b/onnxruntime/core/providers/cann/cann_execution_provider.h @@ -56,6 +56,7 @@ class CANNExecutionProvider : public IExecutionProvider { std::vector<std::unique_ptr<ComputeCapability>> GetCapability( const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* resource_accountant) const override; Status Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs, diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index 3fa3868267c9b..cc7beed6bb298 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -39,6 +39,7 @@ CoreMLExecutionProvider::~CoreMLExecutionProvider() {} std::vector<std::unique_ptr<ComputeCapability>> CoreMLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup&
/*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { std::vector> result; diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.h b/onnxruntime/core/providers/coreml/coreml_execution_provider.h index 0609bf6af726d..574ae1fc0106b 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.h +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.h @@ -20,6 +20,7 @@ class CoreMLExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* resource_accountant) const override; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index b675c08e5f804..54fb4429c0536 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -2660,6 +2660,7 @@ std::unique_ptr CUDAExecutionProvider::GetDataTransf std::vector> CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* resource_accountant) const { std::vector> result; const logging::Logger& logger = *GetLogger(); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 79a48e7cb89e1..a75e81f1f0c6d 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -73,6 +73,7 @@ class CUDAExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* resource_accountant) const override; int GetDeviceId() const override { return info_.device_id; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 9d23b8b950272..868b2103586f9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -93,12 +93,13 @@ namespace Dml ExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, + const onnxruntime::GraphOptimizerRegistry& graph_optimizer_registry, onnxruntime::IResourceAccountant* resource_accountant) const { #ifdef ENABLE_GRAPH_COMPILATION - return m_impl->GetCapability(graph, kernel_lookup, resource_accountant, *GetLogger()); + return m_impl->GetCapability(graph, kernel_lookup, graph_optimizer_registry, resource_accountant, *GetLogger()); #else - return onnxruntime::IExecutionProvider::GetCapability(graph, kernel_lookup, resource_accountant); + return onnxruntime::IExecutionProvider::GetCapability(graph, kernel_lookup, graph_optimizer_registry, resource_accountant); #endif } @@ -878,6 +879,7 @@ namespace Dml ExecutionProviderImpl::GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& 
kernel_lookup, + const onnxruntime::GraphOptimizerRegistry& /* graph_optimizer_registry */, onnxruntime::IResourceAccountant*, const onnxruntime::logging::Logger& logger) const { uint32_t deviceDataTypeMask = GetSupportedDeviceDataTypeMask(); // Each bit corresponds to each DML_TENSOR_DATA_TYPE. diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 7f420f8850001..aa3d8b0b4a409 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -13,6 +13,7 @@ namespace onnxruntime { class IResourceAccountant; +class GraphOptimizerRegistry; } namespace WRL { @@ -93,6 +94,7 @@ namespace Dml GetCapability( const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, + const onnxruntime::GraphOptimizerRegistry& graph_optimizer_registry, onnxruntime::IResourceAccountant* resource_accountant, const onnxruntime::logging::Logger& logger) const; @@ -288,6 +290,7 @@ namespace Dml std::vector> GetCapability(const onnxruntime::GraphViewer& graph, const onnxruntime::IExecutionProvider::IKernelLookup& kernel_lookup, + const onnxruntime::GraphOptimizerRegistry& /* graph_optimizer_registry */, onnxruntime::IResourceAccountant* resource_accountant) const final override; onnxruntime::common::Status OnSessionInitializationEnd() override diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc index 4da82b351f1d6..d0e5b0b1588ef 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc +++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc @@ -147,6 +147,7 @@ std::vector> DnnlExecutionProvider::GetSupportedNodes(con std::vector> DnnlExecutionProvider::GetCapability( const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { // follow from coreml ep's Getcapability diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h index bde18e139f2a3..8f951efef2a94 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h +++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h @@ -25,6 +25,7 @@ class DnnlExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, onnxruntime::IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes_and_graphs, diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 9d00436150286..d8e24ff1f5053 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -791,6 +791,7 @@ std::vector JsExecutionProvider::CreatePreferredAllocators() { std::vector> JsExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { InlinedVector candidates; // `tenative_candidates` is a subset of 
`candidates`. diff --git a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h index 4bead50fc782e..c87303209c689 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.h +++ b/onnxruntime/core/providers/js/js_execution_provider.h @@ -45,6 +45,7 @@ class JsExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 1558d22137c05..9a694b03387ae 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -993,6 +993,7 @@ GetPartitionedSubgraphs(const std::vector& topological_order, std::vector> MIGraphXExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { std::vector> result; auto model = graph_viewer.CreateModel(*GetLogger()); diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index d6af991f9b77e..7c89b5ec544a1 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -69,6 +69,7 @@ class MIGraphXExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes, diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc index 27bd584e2d3c6..28cfde817a620 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc @@ -81,6 +81,7 @@ NnapiExecutionProvider::~NnapiExecutionProvider() {} std::vector> NnapiExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { std::vector> result; const logging::Logger& logger = *GetLogger(); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h index ebf9372eb668d..a2269fdd89436 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.h @@ -26,6 +26,7 @@ class NnapiExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_view, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant 
*/) const override; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 12c16e9c9b8f6..6482a07ee92bc 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -107,6 +107,7 @@ OpenVINOExecutionProvider::~OpenVINOExecutionProvider() { std::vector> OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { std::vector> result; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index bbcca583b074b..020aec16e507c 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -51,6 +51,7 @@ class OpenVINOExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; Status Compile(const std::vector& fused_nodes, diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 1ad17d96e9322..a5813dc2a4adc 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -669,6 +669,7 @@ static void PartitionCtxModel(const onnxruntime::GraphViewer& graph_viewer, std::vector> QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { std::vector> result; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 0f40e40c2fa36..d7a5d04d22692 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -31,6 +31,7 @@ class QNNExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_view, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; Status Compile(const std::vector& fused_nodes_and_graphs, diff --git a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc index 10fd81786f977..e9343e2b2e06a 100644 --- a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc +++ b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc @@ -51,6 +51,7 @@ std::vector> RknpuExecutionProvider::GetSupportedNodes( std::vector> RknpuExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { // Find inputs, initializers and outputs for each supported subgraph std::vector> result; diff --git 
a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h index ce16d63e111d9..75cae37d117a0 100644 --- a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h +++ b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.h @@ -20,6 +20,7 @@ class RknpuExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 9d6e9df907ce3..49771488efc44 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -2441,6 +2441,7 @@ std::unique_ptr ROCMExecutionProvider::GetDataTransf std::vector> ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { InlinedVector candidates; // A subset of the above vector. A subset of the tentative_nodes might be moved to CPU. diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index ff2bff7c98723..2baaf2ff1a886 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -62,6 +62,7 @@ class ROCMExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; int GetDeviceId() const override { return info_.device_id; } diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 6ff2572e5e668..9d61e1f12f5b6 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -200,6 +200,7 @@ struct SparseTensor; class TensorSeq; class SessionState; class ModelMetadefIdGenerator; +class GraphOptimizerRegistry; class If; class Loop; diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 2dab9f6a402a0..90fd36ea29956 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -332,8 +332,9 @@ bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, siz std::vector> IExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& graph_optimizer_registry, IResourceAccountant* resource_accountant) const { - return g_host->IExecutionProvider__GetCapability(this, graph_viewer, kernel_lookup, resource_accountant); + return g_host->IExecutionProvider__GetCapability(this, graph_viewer, kernel_lookup, graph_optimizer_registry, resource_accountant); } common::Status 
IExecutionProvider::Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) { diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index a77f0cb4c27b0..83d615c1bde0a 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -105,6 +105,8 @@ using ModelMetaData = std::unordered_map; using IOnnxRuntimeOpSchemaCollectionPtr = std::shared_ptr; using IOnnxRuntimeOpSchemaRegistryList = std::list; using InitializedTensorSet = std::unordered_map; +using KeyValueConfig = std::unordered_map; +using SelectionFunc = std::function>(const GraphViewer&, const KeyValueConfig&, const GraphOptimizerRegistry&)>; struct Node__NodeIterator { virtual ~Node__NodeIterator() {} @@ -151,6 +153,10 @@ struct ConstGraphNodes_Iterator { struct ProviderHost { virtual const OrtApiBase* OrtGetApiBase() = 0; + virtual Status GetOptimizerByName(const std::string& name, + const GraphOptimizerRegistry& graph_optimizer_registry, + SelectionFunc& selection_func) = 0; + virtual void* HeapAllocate(size_t size) = 0; virtual void HeapFree(void*) = 0; @@ -253,6 +259,7 @@ struct ProviderHost { // IExecutionProvider virtual std::vector> IExecutionProvider__GetCapability(const IExecutionProvider* p, const onnxruntime::GraphViewer& graph_viewer, const IExecutionProvider::IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& graph_optimizer_registry, IResourceAccountant* resource_accountant) = 0; virtual common::Status IExecutionProvider__Compile(IExecutionProvider* p, const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) = 0; @@ -627,6 +634,8 @@ struct ProviderHost { virtual std::unique_ptr ComputeCapability__construct(std::unique_ptr t_sub_graph) = 0; virtual void ComputeCapability__operator_delete(ComputeCapability* p) = 0; virtual std::unique_ptr& ComputeCapability__SubGraph(ComputeCapability* p) = 0; + virtual void ComputeCapability__copy_optimization_func(ComputeCapability* p, ComputeCapability* selection_cc) = 0; + virtual void ComputeCapability__add_nodes_to_optimize(ComputeCapability* p, std::unique_ptr optimization_cc) = 0; // DataTransferManager virtual Status DataTransferManager__CopyTensor(const DataTransferManager* p, const Tensor& src, Tensor& dst) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index a502ce9c66f69..e2af144f455e4 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -527,6 +527,9 @@ struct ComputeCapability final { std::unique_ptr& SubGraph() { return g_host->ComputeCapability__SubGraph(this); } + void copy_optimization_func(ComputeCapability* selection_cc) { g_host->ComputeCapability__copy_optimization_func(this, selection_cc); } + void add_nodes_to_optimize(std::unique_ptr optimization_cc) { g_host->ComputeCapability__add_nodes_to_optimize(this, std::move(optimization_cc)); } + ComputeCapability() = delete; ComputeCapability(const ComputeCapability&) = delete; void operator=(const ComputeCapability&) = delete; diff --git a/onnxruntime/core/providers/snpe/snpe_execution_provider.cc b/onnxruntime/core/providers/snpe/snpe_execution_provider.cc index c7fc6d3a556a7..4eae7c97f9ab0 100644 --- a/onnxruntime/core/providers/snpe/snpe_execution_provider.cc +++ 
b/onnxruntime/core/providers/snpe/snpe_execution_provider.cc @@ -72,6 +72,7 @@ SNPEExecutionProvider::~SNPEExecutionProvider() {} std::vector<std::unique_ptr<ComputeCapability>> SNPEExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { std::vector<NodeIndex> candidates; for (auto& node_index : graph.GetNodesInTopologicalOrder()) { diff --git a/onnxruntime/core/providers/snpe/snpe_execution_provider.h b/onnxruntime/core/providers/snpe/snpe_execution_provider.h index 99033649fcbbf..4b7987b38ee93 100644 --- a/onnxruntime/core/providers/snpe/snpe_execution_provider.h +++ b/onnxruntime/core/providers/snpe/snpe_execution_provider.h @@ -19,6 +19,7 @@ class SNPEExecutionProvider : public IExecutionProvider { std::vector<std::unique_ptr<ComputeCapability>> GetCapability( const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr<KernelRegistry> GetKernelRegistry() const override; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index e59d252793532..523ebbfae807a 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -2459,6 +2459,7 @@ bool TensorrtExecutionProvider::DetectTensorRTGraphCycles(SubGraphCollection_t& std::vector<std::unique_ptr<ComputeCapability>> TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& graph_optimizer_registry, IResourceAccountant* /* resource_accountant */) const { // Construct subgraph capability from node list std::vector<std::unique_ptr<ComputeCapability>> result; @@ -2664,11 +2665,61 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, } } + /** + * Enable EP related L2+ graph optimizations: + * + * 1. Calls the provider bridge API to look up a pre-defined optimizer by name and get its selection function. + * - Example: g_host->GetOptimizerByName(optimizer_name, graph_optimizer_registry, selection_func) + * 2. Executes the selection function to obtain the selection ComputeCapability. + * - ComputeCapability.optimization_func would be set by the optimizer to the function that does the optimization. + * 3. Uses the selection ComputeCapability to create the optimization ComputeCapability. + * 4. Returns the final ComputeCapability, with nodes_to_optimize set to the optimization ComputeCapability. + * + * Currently available optimizations: + * - (ConstantFoldingDQ) constant folding on DQ nodes, i.e. dequantize INT32, UINT16, INT16 constants to FP32. + */ + + SelectionFunc selection_func; + std::vector<std::unique_ptr<ComputeCapability>> selection_cc; + + // Prepare for the ConstantFoldingDQ optimizer + // Note: The NodeIndex here is the node index in the graph, not the index into the node vector in supported_nodes_vector.
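+ // Schematic of what ConstantFoldingDQ achieves for a selected pattern (illustrative,
+ // assuming an INT32 bias initializer; see SelectQualifiedDQNode for the exact criteria):
+ //   before: bias_int32 (initializer) -> DequantizeLinear -> Conv (bias input)
+ //   after:  bias_fp32 (new initializer) ----------------> Conv (bias input)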
+ std::unordered_set<NodeIndex> trt_selection_node_set; // The qualified DQ nodes selected by the TRT EP + std::unordered_map<NodeIndex, NodeIndex> consumer_to_dq; // consumer node -> DQ node + + if (dla_enable_) { + std::string optimizer_name = "ConstantFoldingDQ"; + const std::unordered_map<std::string, std::string> key_value_config; + auto status = g_host->GetOptimizerByName(optimizer_name, graph_optimizer_registry, selection_func); + if (status == Status::OK()) { + if (selection_func) { + selection_cc = selection_func(graph, key_value_config, graph_optimizer_registry); + SelectQualifiedDQNode(graph, trt_selection_node_set, consumer_to_dq); + } + } else { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Can't get optimizer " << optimizer_name; + } + } + + // Create ComputeCapability int number_of_trt_nodes = 0, subgraph_index = 0; - for (const auto& group : supported_nodes_vector) { + for (auto& group : supported_nodes_vector) { if (!group.first.empty()) { + if (!selection_cc.empty()) { + // Include DQ nodes that were filtered out by the TRT parser + UpdateSupportedNodeVectorForDQ(graph, group, supported_nodes_vector, consumer_to_dq); + } + std::unique_ptr<IndexedSubGraph> sub_graph = GetSubGraph(group, graph, model_hash, subgraph_index); - result.push_back(ComputeCapability::Create(std::move(sub_graph))); + auto compute_capability = ComputeCapability::Create(std::move(sub_graph)); + + // add the optimization ComputeCapability to nodes_to_optimize + for (auto& cc : selection_cc) { + std::unique_ptr<ComputeCapability> optimization_cc = CreateOptimizationComputeCapability(cc.get(), trt_selection_node_set, compute_capability.get()); + compute_capability->add_nodes_to_optimize(std::move(optimization_cc)); + } + + result.push_back(std::move(compute_capability)); number_of_trt_nodes += static_cast<int>(group.first.size()); subgraph_index++; } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 873826a81c51b..934cc06eed45f 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -249,6 +249,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { std::vector<std::unique_ptr<ComputeCapability>> GetCapability(const GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& graph_optimizer_registry, IResourceAccountant* /* resource_accountant */) const override; int GetDeviceId() const { return device_id_; } @@ -592,5 +593,35 @@ class TensorrtExecutionProvider : public IExecutionProvider { * This function only creates the instance the first time it's being called. */ nvinfer1::IBuilder* GetBuilder(TensorrtLogger& trt_logger) const; + + /** + * This is the helper function for the ConstantFoldingDQ graph transformer. + * + * It selects the qualified/required DQ nodes to be optimized and provides a mapping table + * to help the TRT EP later include the DQ nodes that were filtered out by the TRT parser. + */ + void SelectQualifiedDQNode(const GraphViewer& graph, + std::unordered_set<NodeIndex>& selection_node_set, + std::unordered_map<NodeIndex, NodeIndex>& consumer_to_dq) const; + + /** + * This function returns an optimization ComputeCapability that is limited to: + * 1. the DQ nodes in this individual TRT ComputeCapability + * 2. the DQ nodes that are qualified and selected by the TRT EP + * + * It also needs to make sure the DQ nodes are a subset of the complete list of DQ nodes to optimize in the original selection ComputeCapability. + * Finally, copy the optimization function from the original selection ComputeCapability.
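+ * Equivalently: nodes(optimization_cc) = nodes(trt_cc) ∩ trt_selection_node_set ∩ nodes(selection_cc).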
+ */ + std::unique_ptr<ComputeCapability> CreateOptimizationComputeCapability(ComputeCapability* selection_cc, + std::unordered_set<NodeIndex>& trt_selection_node_set, + ComputeCapability* trt_cc) const; + /** + * This function helps add back the DQ nodes that were filtered out by the TRT parser. + * The reason is that those DQ nodes can be constant folded (dequantized) by the ConstantFoldingDQ optimizer during ORT L2+ optimization. + */ + void UpdateSupportedNodeVectorForDQ(const GraphViewer& graph, + SubGraph_t& supported_node_vector, + SubGraphCollection_t& supported_nodes_vector, + std::unordered_map<NodeIndex, NodeIndex> consumer_to_dq) const; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc index 92fa101118506..71674f7c9c557 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc @@ -258,4 +258,133 @@ void TensorrtExecutionProvider::SetAllGraphInputs(Graph& graph) const { graph.SetInputs(graph_inputs_including_initializers); } + +/** + * This is the helper function for the ConstantFoldingDQ graph transformer. + * + * It selects the qualified/required DQ nodes to be optimized and provides a mapping table + * to help the TRT EP later include the DQ nodes that were filtered out by the TRT parser. + */ +void TensorrtExecutionProvider::SelectQualifiedDQNode(const GraphViewer& graph, + std::unordered_set<NodeIndex>& selection_node_set, + std::unordered_map<NodeIndex, NodeIndex>& consumer_to_dq) const { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Select qualified DQ nodes ..."; + const std::vector<NodeIndex>& node_index = graph.GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); + for (auto index : node_index) { + auto* node = graph.GetNode(index); + if (!node) { + continue; + } + + const auto* input_def = node->InputDefs()[0]; // Get NodeArg of the initializer of the DequantizeLinear node; + auto data_type = input_def->TypeAsProto()->tensor_type().elem_type(); + auto constant_initializer = graph.IsConstantInitializer(input_def->Name(), true); + + // Node selection: (i.e. initializer -> DQ -> bias of X) + // 1. DequantizeLinear op + // 2. DQ node does not produce a graph output and has a single consumer + // 3. The first input of DQ is a constant initializer. + // 4. The data type of the initializer is INT32, UINT16 or INT16 + // 5. X should be Gemm, Conv or LayerNormalization ? + if (node->OpType() == "DequantizeLinear" && + node->GetOutputEdgesCount() == 1 && + (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT32 || data_type == ONNX_NAMESPACE::TensorProto_DataType_INT16 || data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT16) && + constant_initializer) { + const Node& consumer_node = *node->OutputNodesBegin(); + selection_node_set.insert(index); + consumer_to_dq[consumer_node.Index()] = index; + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] " << consumer_node.Name() << " <- " << node->Name(); + } + } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Total " << selection_node_set.size() << " DequantizeLinear node(s) are selected."; +} + +/** + * This function returns an optimization ComputeCapability that is limited to: + * 1. the DQ nodes in this individual TRT ComputeCapability + * 2. the DQ nodes that are qualified and selected by the TRT EP + * + * It also needs to make sure the DQ nodes are a subset of the complete list of DQ nodes to optimize in the original selection ComputeCapability. + * Finally, copy the optimization function from the original selection ComputeCapability. + */ +std::unique_ptr<ComputeCapability> TensorrtExecutionProvider::CreateOptimizationComputeCapability(ComputeCapability* selection_cc, + std::unordered_set<NodeIndex>& trt_selection_node_set, + ComputeCapability* trt_cc) const { + auto sub_graph = onnxruntime::IndexedSubGraph::Create(); + std::unordered_set<NodeIndex> selection_node_set; + + for (auto index : selection_cc->SubGraph()->Nodes()) { + selection_node_set.insert(index); + } + + for (auto index : trt_cc->SubGraph()->Nodes()) { + if (selection_node_set.find(index) == selection_node_set.end()) { + continue; + } + if (trt_selection_node_set.find(index) == trt_selection_node_set.end()) { + continue; + } + sub_graph->Nodes().push_back(index); + } + auto compute_capability = ComputeCapability::Create(std::move(sub_graph)); + compute_capability->copy_optimization_func(selection_cc); + return compute_capability; +} + +/** + * This function helps add back the DQ nodes that were filtered out by the TRT parser. + * The reason is that those DQ nodes can be constant folded (dequantized) by the ConstantFoldingDQ optimizer during ORT L2+ optimization. + */ +void TensorrtExecutionProvider::UpdateSupportedNodeVectorForDQ(const GraphViewer& graph, + SubGraph_t& supported_node_vector, + SubGraphCollection_t& supported_nodes_vector, + std::unordered_map<NodeIndex, NodeIndex> consumer_to_dq) const { + if (consumer_to_dq.empty()) { + return; + } + + if (!supported_node_vector.second) { + return; + } + + const std::vector<NodeIndex>& node_index = graph.GetNodesInTopologicalOrder(1); + auto supported_nodes = supported_node_vector.first; + for (auto index : supported_nodes) { + if (consumer_to_dq.find(node_index[index]) == consumer_to_dq.end()) { + continue; + } + + auto dq_node_index = consumer_to_dq[node_index[index]]; + + // Check if the DQ node is included in one of the subgraphs + auto in_the_subgraph_collection = [&](NodeIndex node_idx) -> bool { + for (auto& node_vector : supported_nodes_vector) { + if (!node_vector.second) { + continue; + } + for (auto i : node_vector.first) { + if (node_index[i] == node_idx) { + return true; + } + } + } + return false; + }; + + // If the DQ node is already in the subgraph, do nothing. + if (in_the_subgraph_collection(dq_node_index)) { + continue; + } + + // Find the iterator pointing to the target element + auto it = std::find(node_index.begin(), node_index.end(), dq_node_index); + if (it != node_index.end()) { + // Calculate the index + size_t idx = std::distance(node_index.begin(), it); + supported_node_vector.first.push_back(idx); + auto node = graph.GetNode(dq_node_index); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] " << node->Name() << " is added back after being filtered out by the TRT parser."; + } + } } } // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 5d2204b0b1979..ab8a95b38491d 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -51,7 +51,7 @@ const InlinedVector<const Node*> VitisAIExecutionProvider::GetEpContextNodes() c return ep_context_node_ptrs; } std::vector<std::unique_ptr<ComputeCapability>> VitisAIExecutionProvider::GetCapability( - const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup, IResourceAccountant* /* resource_accountant */) const { + const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup, const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { if (graph_viewer.IsSubgraph()) { // The VITIS AI EP does not support subgraphs. Assigned to CPU. return {}; diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h index 5b031ab882839..f72f8cc721fbd 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h @@ -29,6 +29,7 @@ class VitisAIExecutionProvider : public IExecutionProvider { std::vector<std::unique_ptr<ComputeCapability>> GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; int GetDeviceId() const { return 0; } diff --git a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc index 4b9f6fae86423..3b5daef04dd50 100644 --- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc +++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc @@ -62,6 +62,7 @@ VSINPUExecutionProvider::~VSINPUExecutionProvider() {} std::vector<std::unique_ptr<ComputeCapability>> VSINPUExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { std::vector<std::unique_ptr<ComputeCapability>> result; diff --git a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h index 16cfbc8a9c581..1c0b8b63a8e6c 100644 --- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h +++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h @@ -40,6 +40,7 @@ class VSINPUExecutionProvider : public IExecutionProvider { std::vector<std::unique_ptr<ComputeCapability>> GetCapability( const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr<KernelRegistry> GetKernelRegistry() const override; Status
Compile(const std::vector& fused_nodes_and_graphs, diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index f3bf2402252b7..df7f2d6dcdeab 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -765,6 +765,7 @@ std::vector WebGpuExecutionProvider::CreatePreferredAllocators() { std::vector> WebGpuExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { InlinedVector candidates; // `tenative_candidates` is a subset of `candidates`. diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h index dc25636821651..e2e23b6a307cf 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h @@ -45,6 +45,7 @@ class WebGpuExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index 39e6520e3912b..7410ff66add30 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -56,6 +56,7 @@ WebNNExecutionProvider::~WebNNExecutionProvider() {} std::vector> WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_registries*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { // For subgraph which is the attribute of the control flow nodes, part of its initializers are stored in its // ancestor graphs as common initializers shared for other subgraphs. 
We need to collect all of them used for diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.h b/onnxruntime/core/providers/webnn/webnn_execution_provider.h index e806dc340d53e..b8775e717668a 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.h +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.h @@ -25,6 +25,7 @@ class WebNNExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_registries*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; DataLayout GetPreferredLayout() const override { return preferred_layout_; } diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 641f8b0729d0a..ab14c083884d3 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -258,6 +258,7 @@ static void AddComputeCapabilityForEachNodeInNodeUnit( std::vector> XnnpackExecutionProvider::GetCapability( const onnxruntime::GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { const auto& logger = *GetLogger(); std::vector> capabilities; diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h index 152bef1a1c52c..9c4d2484f9f4b 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.h @@ -33,6 +33,7 @@ class XnnpackExecutionProvider : public IExecutionProvider { std::vector> GetCapability( const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; std::shared_ptr GetKernelRegistry() const override; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index e941b1ebbaba8..e5ea562ce3535 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -42,6 +42,7 @@ #include "core/graph/model_saving_options.h" #include "core/optimizer/graph_transformer_utils.h" #include "core/optimizer/graph_transformer.h" +#include "core/optimizer/graph_optimizer_registry.h" #include "core/optimizer/layout_transformation/layout_transformation.h" #include "core/optimizer/insert_cast_transformer.h" #include "core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.h" @@ -1278,8 +1279,13 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool // 6. insert cast nodes (required transformer). // 7. insert copy nodes (required transformer). 
+ // Create GraphOptimizerRegistry instance for providing predefined graph optimizers and selection functions for EPs to lookup + auto graph_optimizer_registry = std::make_unique(&session_options_, + execution_providers_.Get(onnxruntime::kCpuExecutionProvider), + session_logger_); + GraphPartitioner partitioner(kernel_registry_manager_, execution_providers_, std::move(graph_optimizer_registry)); + // Run Ahead Of time function inlining - GraphPartitioner partitioner(kernel_registry_manager_, execution_providers_); if (const bool disable_aot_function_inlining = session_options_.config_options.GetConfigOrDefault( kOrtSessionOptionsDisableAheadOfTimeFunctionInlining, "0") == "1"; @@ -1682,7 +1688,7 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph, const ExecutionProviders& providers, KernelRegistryManager& kernel_registry_manager, SessionState& session_state, - const ConfigOptions& config_options, + const SessionOptions& sess_options, const logging::Logger& logger) { layout_transformation::TransformLayoutFunction transform_layout_fn = nullptr; @@ -1700,11 +1706,16 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph, } #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) - GraphPartitioner partitioner(kernel_registry_manager, providers); + // Create GraphOptimizerRegistry instance for providing predefined graph optimizers and selection functions for EPs to lookup + auto graph_optimizer_registry = std::make_unique(&sess_options, + providers.Get(onnxruntime::kCpuExecutionProvider), + &logger); + + GraphPartitioner partitioner(kernel_registry_manager, providers, std::move(graph_optimizer_registry)); ORT_RETURN_IF_ERROR(partitioner.Partition(graph, session_state.GetMutableFuncMgr(), transform_layout_fn, - config_options, + sess_options.config_options, logger, GraphPartitioner::Mode::kOrtFormatLoad)); @@ -2147,7 +2158,7 @@ common::Status InferenceSession::Initialize() { #endif // !defined(ORT_MINIMAL_BUILD) } else { ORT_RETURN_IF_ERROR_SESSIONID_(PartitionOrtFormatModel(graph, execution_providers_, kernel_registry_manager_, - *session_state_, session_options_.config_options, *session_logger_)); + *session_state_, session_options_, *session_logger_)); #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) const auto& cpu_ep = *execution_providers_.Get(onnxruntime::kCpuExecutionProvider); diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 1d25ceb9af8a3..69dea34175155 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -4,6 +4,7 @@ // This is the Onnxruntime side of the bridge to allow providers to be built as a DLL // It implements onnxruntime::ProviderHost +#include #include "core/common/inlined_containers.h" #include "core/common/path_string.h" #include "core/framework/allocator_utils.h" @@ -35,6 +36,7 @@ #include "core/graph/graph_proto_serializer.h" #include "core/framework/murmurhash3.h" #include "core/framework/model_metadef_id_generator.h" +#include "core/optimizer/graph_optimizer_registry.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" @@ -237,6 +239,21 @@ common::Status LoadDynamicLibraryFromProvider(onnxruntime::PathString library_na struct ProviderHostImpl : ProviderHost { const OrtApiBase* OrtGetApiBase() override { return ::OrtGetApiBase(); } + Status GetOptimizerByName(const std::string& name, + 
const GraphOptimizerRegistry& graph_optimizer_registry, + SelectionFunc& selection_func) override { + std::string optimizer_name(name); + + auto func = graph_optimizer_registry.GetSelectionFunc(optimizer_name); + + if (func.has_value()) { + selection_func = func.value(); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to get optimizer " + optimizer_name); + } + return Status::OK(); + }; + void* HeapAllocate(size_t size) override { return new uint8_t[size]; } void HeapFree(void* p) override { delete[] reinterpret_cast(p); } @@ -360,8 +377,9 @@ struct ProviderHostImpl : ProviderHost { std::vector> IExecutionProvider__GetCapability( const IExecutionProvider* p, const onnxruntime::GraphViewer& graph_viewer, const IExecutionProvider::IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& graph_optimizer_registry, IResourceAccountant* resource_accountant) override { - return p->IExecutionProvider::GetCapability(graph_viewer, kernel_lookup, resource_accountant); + return p->IExecutionProvider::GetCapability(graph_viewer, kernel_lookup, graph_optimizer_registry, resource_accountant); } common::Status IExecutionProvider__Compile(IExecutionProvider* p, const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override { @@ -797,6 +815,8 @@ struct ProviderHostImpl : ProviderHost { std::unique_ptr ComputeCapability__construct(std::unique_ptr t_sub_graph) override { return std::make_unique(std::move(t_sub_graph)); } void ComputeCapability__operator_delete(ComputeCapability* p) override { delete p; } std::unique_ptr& ComputeCapability__SubGraph(ComputeCapability* p) override { return p->sub_graph; } + void ComputeCapability__copy_optimization_func(ComputeCapability* p, ComputeCapability* selection_cc) override { p->optimization_func = selection_cc->optimization_func; } + void ComputeCapability__add_nodes_to_optimize(ComputeCapability* p, std::unique_ptr optimization_cc) override { p->nodes_to_optimize.push_back(std::move(optimization_cc)); } // DataTransferManager (wrapped) Status DataTransferManager__CopyTensor(const DataTransferManager* p, const Tensor& src, Tensor& dst) override { return p->CopyTensor(src, dst); } @@ -1631,6 +1651,7 @@ struct ProviderHostImpl : ProviderHost { Status LoadDynamicLibrary(onnxruntime::PathString library_name) override { return LoadDynamicLibraryFromProvider(library_name); }; #endif } provider_host_; + #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) #endif diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 1b06eb55afbd2..95101c8075fc2 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -138,6 +138,7 @@ class FuseExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override { // Fuse two add into one. 
std::vector> result; diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index b6b915f90d99a..8f4eede76b905 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -27,6 +27,7 @@ #include "test/util/include/default_providers.h" #include "test/util/include/file_util.h" #include "core/optimizer/layout_transformation/layout_transformation.h" +#include "core/optimizer/graph_optimizer_registry.h" using namespace ONNX_NAMESPACE; namespace onnxruntime { @@ -264,7 +265,11 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) { SessionState session_state(graph, execution_providers, tp.get(), nullptr, dtm, edlm, DefaultLoggingManager().DefaultLogger(), profiler, sess_options); - GraphPartitioner partitioner(krm, execution_providers); + // Create GraphOptimizerRegistry instance for providing predefined graph optimizers and selection functions for EPs to lookup + auto graph_optimizer_registry = std::make_unique(&sess_options, + execution_providers.Get(onnxruntime::kCpuExecutionProvider), + &DefaultLoggingManager().DefaultLogger()); + GraphPartitioner partitioner(krm, execution_providers, std::move(graph_optimizer_registry)); ASSERT_STATUS_OK( partitioner.Partition( graph, session_state.GetMutableFuncMgr(), @@ -350,8 +355,12 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { SessionState session_state(graph, execution_providers, nullptr, nullptr, dtm, edlm, DefaultLoggingManager().DefaultLogger(), profiler, sess_options); + // Create GraphOptimizerRegistry instance for providing predefined graph optimizers and selection functions for EPs to lookup + auto graph_optimizer_registry = std::make_unique(&sess_options, + execution_providers.Get(onnxruntime::kCpuExecutionProvider), + &DefaultLoggingManager().DefaultLogger()); // Partition the graph - GraphPartitioner partitioner(krm, execution_providers); + GraphPartitioner partitioner(krm, execution_providers, std::move(graph_optimizer_registry)); ASSERT_STATUS_OK(partitioner.Partition( graph, session_state.GetMutableFuncMgr(), [&cpu_allocator](Graph& graph, bool& modified, const IExecutionProvider& execution_provider, @@ -409,8 +418,13 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { SessionState session_state(graph, execution_providers, nullptr, nullptr, dtm, edlm, DefaultLoggingManager().DefaultLogger(), profiler, sess_options); + // Create GraphOptimizerRegistry instance for providing predefined graph optimizers and selection functions for EPs to lookup + auto graph_optimizer_registry = std::make_unique(&sess_options, + execution_providers.Get(onnxruntime::kCpuExecutionProvider), + &DefaultLoggingManager().DefaultLogger()); + // Partition the graph - GraphPartitioner partitioner(krm, execution_providers); + GraphPartitioner partitioner(krm, execution_providers, std::move(graph_optimizer_registry)); ASSERT_STATUS_OK(partitioner.Partition( graph, session_state.GetMutableFuncMgr(), [&cpu_allocator](Graph& graph, bool& modified, @@ -479,7 +493,12 @@ void LoadWithResourceAwarePartitioning(const ORTCHAR_T* model_path, SessionState session_state(model->MainGraph(), execution_providers, tp.get(), nullptr, dtm, edlm, default_logger, profiler, sess_options); - GraphPartitioner partitioner(krm, execution_providers); + // Create GraphOptimizerRegistry instance for providing predefined graph optimizers and selection functions for EPs to lookup + auto graph_optimizer_registry = 
std::make_unique(&sess_options, + execution_providers.Get(onnxruntime::kCpuExecutionProvider), + &DefaultLoggingManager().DefaultLogger()); + + GraphPartitioner partitioner(krm, execution_providers, std::move(graph_optimizer_registry)); layout_transformation::TransformLayoutFunction transform_layout_fn; layout_transformation::DebugGraphFn debug_graph_fn; ASSERT_STATUS_OK( diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc index b753bc386d722..ee0aff6d26444 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc +++ b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.cc @@ -111,6 +111,7 @@ DataLayout InternalTestingExecutionProvider::GetPreferredLayout() const { std::vector> InternalTestingExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& kernel_lookup, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const { // find nodes that have ops in our supported list std::unordered_set supported_static_nodes; diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h index d2ed8259ee974..0caa0febc2796 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h +++ b/onnxruntime/test/providers/internal_testing/internal_testing_execution_provider.h @@ -20,6 +20,7 @@ class InternalTestingExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const onnxruntime::GraphViewer& graph_view, const IKernelLookup& /*kernel_lookup*/, + const GraphOptimizerRegistry& /* graph_optimizer_registry */, IResourceAccountant* /* resource_accountant */) const override; common::Status Compile(const std::vector& fused_nodes, diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc index e2deccc4fff0f..2361e179d1cf1 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc @@ -14,6 +14,7 @@ #include "core/framework/compute_capability.h" #include "core/graph/graph.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/optimizer/graph_optimizer_registry.h" namespace onnxruntime { namespace test { @@ -279,9 +280,10 @@ static BackendSupport GetHTPSupport(const onnxruntime::logging::Logger& logger) onnxruntime::GraphViewer graph_viewer(graph); std::unique_ptr qnn_ep = QnnExecutionProviderWithOptions( {{"backend_path", "QnnHtp.dll"}, {"offload_graph_io_quantization", "0"}}); + GraphOptimizerRegistry graph_optimizer_registry(nullptr, nullptr, nullptr); // as a placeholder to feed into GetCapability qnn_ep->SetLogger(&logger); - auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup, nullptr); + auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup, graph_optimizer_registry, nullptr); return result.empty() ? 
BackendSupport::UNSUPPORTED : BackendSupport::SUPPORTED; } @@ -342,9 +344,10 @@ static BackendSupport GetCPUSupport(const onnxruntime::logging::Logger& logger) onnxruntime::GraphViewer graph_viewer(graph); std::unique_ptr qnn_ep = QnnExecutionProviderWithOptions( {{"backend_path", "QnnCpu.dll"}, {"offload_graph_io_quantization", "0"}}); + GraphOptimizerRegistry graph_optimizer_registry(nullptr, nullptr, nullptr); // as a placeholder to feed into GetCapability qnn_ep->SetLogger(&logger); - auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup, nullptr); + auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup, graph_optimizer_registry, nullptr); return result.empty() ? BackendSupport::UNSUPPORTED : BackendSupport::SUPPORTED; } From 2ba076aa9ea8e3c68e97dc0f2463fb2786fcb96f Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Thu, 6 Mar 2025 23:09:22 -0800 Subject: [PATCH 34/46] fix binplace file in web pipeline (#23930) --- .../github/azure-pipelines/templates/win-web-ci.yml | 12 ++++++++++-- .../templates/win-web-multi-browsers.yml | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index b77cab6a19ba0..6868043f64d81 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -88,10 +88,18 @@ jobs: inputs: sourceFolder: $(Pipeline.Workspace)\artifacts contents: | - **\*.* + **\ort-*.wasm targetFolder: $(Build.SourcesDirectory)\js\web\dist flattenFolders: true - displayName: 'Binplace dist files' + displayName: 'Binplace dist files (.wasm)' + - task: CopyFiles@2 + inputs: + sourceFolder: $(Pipeline.Workspace)\artifacts + contents: | + **\ort-*.mjs + targetFolder: $(Build.SourcesDirectory)\js\web\dist + flattenFolders: true + displayName: 'Binplace dist files (.mjs)' - script: | npm ci workingDirectory: '$(Build.SourcesDirectory)\js' diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml index e201cc0ffdd5a..00df695889b1d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml @@ -44,10 +44,18 @@ jobs: inputs: sourceFolder: $(Pipeline.Workspace)\artifacts contents: | - **\*.* + **\ort-*.wasm targetFolder: $(Build.SourcesDirectory)\js\web\dist flattenFolders: true - displayName: 'Binplace dist files' + displayName: 'Binplace dist files (.wasm)' + - task: CopyFiles@2 + inputs: + sourceFolder: $(Pipeline.Workspace)\artifacts + contents: | + **\ort-*.mjs + targetFolder: $(Build.SourcesDirectory)\js\web\dist + flattenFolders: true + displayName: 'Binplace dist files (.mjs)' - script: | npm ci workingDirectory: '$(Build.SourcesDirectory)\js' From e47c6c165df7403bf7267ceb8c1558cf5e2ddd3f Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Fri, 7 Mar 2025 00:39:49 -0800 Subject: [PATCH 35/46] Updated run_CIs_for_external_pr.py to support the Windows OpenVINO CI pipeline (#23931) --- tools/python/run_CIs_for_external_pr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index 1546a9143831a..aca5f1df7d18b 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ 
-24,6 +24,7 @@ def get_pipeline_names(): "Windows GPU DML CI Pipeline", "Windows GPU Doc Gen CI Pipeline", "Windows GPU TensorRT CI Pipeline", + "Windows OpenVINO CI Pipeline", "ONNX Runtime Web CI Pipeline", "Win_TRT_Minimal_CUDA_Test_CI", # linux From 8969ee7817de81de36affad8feecc13aae8d97aa Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 7 Mar 2025 18:58:01 +1000 Subject: [PATCH 36/46] Fix ConvInteger handling of optional inputs. (#23935) ### Description Fix ConvInteger handling of optional inputs. Need to check Exists() and not just the number of inputs. ### Motivation and Context #23927 --- .../cpu/quantization/conv_integer.cc | 7 ++-- .../providers/cpu/nn/conv_integer_test.cc | 40 +++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/cpu/quantization/conv_integer.cc b/onnxruntime/core/providers/cpu/quantization/conv_integer.cc index 03b39e19ed748..f3c6b18f8e753 100644 --- a/onnxruntime/core/providers/cpu/quantization/conv_integer.cc +++ b/onnxruntime/core/providers/cpu/quantization/conv_integer.cc @@ -34,17 +34,18 @@ ONNX_OPERATOR_KERNEL_EX( ConvInteger); Status ConvInteger::Compute(OpKernelContext* context) const { - size_t num_inputs = OpKernel::Node().InputDefs().size(); + const auto input_defs = Node().InputDefs(); + size_t num_inputs = input_defs.size(); const auto* X = context->Input(0); const auto* W = context->Input(1); uint8_t input_offset = 0; uint8_t filter_offset = 0; - if (num_inputs >= 3) { + if (num_inputs >= 3 && input_defs[2]->Exists()) { const auto* X_Zero_Point = context->Input(2); ORT_ENFORCE(IsScalarOr1ElementVector(X_Zero_Point), "Must be a scalar or 1D tensor or size 1."); input_offset = *(X_Zero_Point->Data()); } - if (num_inputs >= 4) { + if (num_inputs >= 4 && input_defs[3]->Exists()) { const auto* W_Zero_Point = context->Input(3); ORT_ENFORCE(IsScalarOr1ElementVector(W_Zero_Point), "Non per-tensor quantization is not supported now."); filter_offset = *(W_Zero_Point->Data()); diff --git a/onnxruntime/test/providers/cpu/nn/conv_integer_test.cc b/onnxruntime/test/providers/cpu/nn/conv_integer_test.cc index a5378fa3cefd7..c98d9e28b2f46 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_integer_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_integer_test.cc @@ -254,5 +254,45 @@ TEST(ConvIntegerTest, WithStride3_2D_u8u8) { test.Run(); } +TEST(ConvIntegerTest, NoXZeroPoint) { + OpTester test("ConvInteger", 10); + std::vector x_dims{1, 1, 3, 3}; + test.AddInput("x", x_dims, + {2, 3, 4, + 5, 6, 7, + 8, 9, 10}); + std::vector w_dims{1, 1, 2, 2}; + test.AddInput("w", w_dims, + {2, 2, + 2, 2}); + test.AddOptionalInputEdge(); + test.AddInput("w_zero_point", {}, {1}); + std::vector y_dims{1, 1, 2, 2}; + test.AddOutput("y", y_dims, + {16, 20, + 28, 32}); + test.Run(); +} + +// provide optional input with empty name for w. tests that input args == 4 but the w_zero_point does not exist. 
+TEST(ConvIntegerTest, NoWZeroPoint) { + OpTester test("ConvInteger", 10); + std::vector x_dims{1, 1, 3, 3}; + test.AddInput("x", x_dims, + {2, 3, 4, + 5, 6, 7, + 8, 9, 10}); + std::vector w_dims{1, 1, 2, 2}; + test.AddInput("w", w_dims, + {2, 2, + 2, 2}); + test.AddInput("x_zero_point", {}, {1}); + test.AddOptionalInputEdge(); + std::vector y_dims{1, 1, 2, 2}; + test.AddOutput("y", y_dims, + {24, 32, + 48, 56}); + test.Run(); +} } // namespace test } // namespace onnxruntime From 26f590b34519c4da964f7c01c40c5a0963eb82b8 Mon Sep 17 00:00:00 2001 From: saurabh Date: Fri, 7 Mar 2025 01:45:39 -0800 Subject: [PATCH 37/46] Updated ov version in pipeline (#595) (#23882) ### Description This PR updates the OpenVINO version used in the pipeline from 2024.5.0 to 2025.0.0 Co-authored-by: jatinwadhwa921 <110383850+jatinwadhwa921@users.noreply.github.com> --- .../github/azure-pipelines/linux-openvino-ci-pipeline.yml | 2 +- .../github/linux/docker/Dockerfile.ubuntu_openvino | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml index e89fa0ece2c76..48627e656b9a8 100644 --- a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml @@ -33,5 +33,5 @@ jobs: parameters: AgentPool : 'Linux-CPU-2019' JobName: 'Linux_CI_Dev' - RunDockerBuildArgs: '-o ubuntu22.04 -p 3.10 -d openvino -v 2024.5.0 -x "--enable_generic_interface --use_openvino CPU --build_wheel"' + RunDockerBuildArgs: '-o ubuntu22.04 -p 3.10 -d openvino -v 2025.0.0 -x "--enable_generic_interface --use_openvino CPU --build_wheel"' TimeoutInMinutes: 120 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino index 7b1e3fa677375..b53a2302be403 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino @@ -1,7 +1,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:${UBUNTU_VERSION} -ARG OPENVINO_VERSION=2024.5.0 +ARG OPENVINO_VERSION=2025.0.0 ARG PYTHON_VERSION=3.10 ADD scripts /tmp/scripts @@ -19,9 +19,9 @@ ENV IE_PLUGINS_PATH=$INTEL_OPENVINO_DIR/runtime/lib/intel64 ENV DEBIAN_FRONTEND=noninteractive RUN cd /opt && mkdir -p intel && cd intel && \ - wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.5/linux/l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64.tgz && \ - tar xzf l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64.tgz && \ - mv l_openvino_toolkit_ubuntu22_2024.5.0.17288.7975fa5da0c_x86_64 openvino_2024.5.0 && \ + wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2025.0/linux/openvino_toolkit_ubuntu22_2025.0.0.17942.1f68be9f594_x86_64.tgz && \ + tar xzf openvino_toolkit_ubuntu22_2025.0.0.17942.1f68be9f594_x86_64.tgz && rm -rf openvino_toolkit_ubuntu22_2025.0.0.17942.1f68be9f594_x86_64.tgz && \ + mv openvino_toolkit_ubuntu22_2025.0.0.17942.1f68be9f594_x86_64 openvino_2025.0.0 && \ cd $INTEL_OPENVINO_DIR/install_dependencies && ./install_openvino_dependencies.sh -y WORKDIR /root From f25deaeaa46fab20d560687a528413ef031adb91 Mon Sep 17 00:00:00 2001 From: Ranjit Ranjan <165394499+ranjitshs@users.noreply.github.com> Date: Fri, 7 Mar 2025 23:17:30 +0530 Subject: [PATCH 38/46] [AIX] External data handling 
(#23859)

### Description
On big-endian (BE) systems, model tensor data coming from an external file is not handled properly. This was found while debugging https://github.com/microsoft/onnxruntime-genai/issues/1104. This PR performs the endianness conversion of data loaded from an external file on BE systems.

--- .../core/framework/session_state_utils.cc | 5 +++++ onnxruntime/core/framework/tensorprotoutils.cc | 16 +++++++++++++++- onnxruntime/core/framework/tensorprotoutils.h | 10 ++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-)
diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index 343d634b44691..9d45ec38e5a32 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -81,6 +81,11 @@ static common::Status ExtDataTensorProtoToTensor(const Env& env, ORT_RETURN_IF_ERROR(utils::GetExtDataFromTensorProto(env, proto_path.c_str(), tensor_proto, ext_data_buf, ext_data_len, ext_data_deleter, buffered_tensor, &prepacked_for_graph)); + if constexpr (endian::native != endian::little) { + if (!proto_path.empty() && (proto_path.compare(onnxruntime::utils::kTensorProtoMemoryAddressTag) != 0)) { + utils::ConvertRawDataInTensorProto(const_cast<ONNX_NAMESPACE::TensorProto*>(&tensor_proto), ext_data_buf, ext_data_len); + } + } // NB: creating a do-nothing allocator per tensor is wasteful; can perhaps be // avoided if the Tensor class implements the do-nothing behavior when given a
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index ae1ec2e53bd7c..94a2a6677358e 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -270,10 +270,15 @@ void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::str tensor_proto.set_raw_data(std::move(param)); } -void ConvertRawDataInTensorProto(TensorProto* tensor) { +void ConvertRawDataInTensorProto(TensorProto* tensor, + void* ext_data_buf, + size_t ext_data_len) { size_t element_size = 1; char* bytes = NULL; size_t num_elements = 0; + if (ext_data_buf && !ext_data_len) { + return; + } switch (tensor->data_type()) { case TensorProto_DataType_FLOAT: bytes = reinterpret_cast<char*>(tensor->mutable_float_data()->mutable_data()); @@ -337,6 +342,15 @@ void ConvertRawDataInTensorProto(TensorProto* tensor) { num_elements = (tensor->raw_data().size()) / element_size; bytes = const_cast<char*>(tensor->mutable_raw_data()->c_str()); } + + if (element_size == 1) { + return; + } + if (ext_data_buf) { + ORT_ENFORCE(ext_data_len % element_size == 0); + num_elements = ext_data_len / element_size; + bytes = reinterpret_cast<char*>(ext_data_buf); + } for (size_t i = 0; i < num_elements; ++i) { char* start_byte = bytes + i * element_size; char* end_byte = start_byte + element_size - 1;
diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index f5dec7ae988f2..79eae48c10411 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -41,12 +41,18 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, ExternalDataInfo::PrepackedInfos* prepacked_infos = nullptr); /** * This function is used to convert the endianess of Tensor data. + * If ext_data_buf is provided, then this buffer content's endianness + * will be changed. * Mostly, will be used in big endian system to support the model file * generated on little endian system.
- * @param initializer given initializer tensor + * @param tensor_proto given initializer tensor + * @param ext_data_buf optional external data buffer + * @param ext_data_len optional external data buffer length * @returns None */ -void ConvertRawDataInTensorProto(ONNX_NAMESPACE::TensorProto* initializer); +void ConvertRawDataInTensorProto(ONNX_NAMESPACE::TensorProto* tensor_proto, + void* ext_data_buf = NULL, + size_t ext_data_len = 0); /** * Wrapper function for set_raw_data.
From 593d5c0ed407ea1f7cfa110f0e4dc5873533705e Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Fri, 7 Mar 2025 10:13:55 -0800 Subject: [PATCH 39/46] Create a packaging pipeline for a custom nuget package (#23918) --- .../custom-nuget-packaging-pipeline.yml | 142 +++++++++++++++++ ...acts-package-and-publish-steps-windows.yml | 16 ++ .../azure-pipelines/templates/qnn-ep-win.yml | 27 +++- .../azure-pipelines/templates/win-ci.yml | 2 +- .../nuget/generate_nuspec_for_custom_nuget.py | 150 ++++++++++++++++++ 5 files changed, 331 insertions(+), 6 deletions(-) create mode 100644 tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml create mode 100644 tools/nuget/generate_nuspec_for_custom_nuget.py
diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml new file mode 100644 index 0000000000000..8aaaa0e85585a --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml @@ -0,0 +1,142 @@ +parameters: +- name: CudaVersion + type: string + default: '12.2' + +- name: QnnSdk + displayName: QNN SDK Version + type: string + default: 2.31.0.250130 + +- name: IsReleaseBuild + displayName: Is a release build? Set it to true if you are doing an Onnx Runtime release. + type: boolean + default: false + +- name: PackageName + displayName: What is the package name?
+ type: string + default: 'Microsoft.ML.OnnxRuntime.Flamingo' + +variables: + - template: templates/common-variables.yml + - name: ReleaseVersionSuffix + value: '' + - name: win_cuda_home + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: $(Agent.TempDirectory)\v11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: $(Agent.TempDirectory)\v12.2 + +stages: + - template: templates/win-ci.yml + parameters: + ort_build_pool_name: 'onnxruntime-Win2022-GPU-A10' + DoCompliance: false + DoEsrp: true + stage_name_suffix: CUDA + buildArch: x64 + msbuildPlatform: x64 + packageName: x64-cuda + CudaVersion: ${{ parameters.CudaVersion }} + buildparameter: --use_cuda --cuda_home=${{ variables.win_cuda_home }} --enable_onnx_tests --enable_wcos --use_webgpu --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52-real;61-real;75-real;86-real;89-real;90-virtual" + runTests: false + buildJava: false + java_artifact_id: onnxruntime_gpu + UseIncreasedTimeoutForTests: false + SpecificArtifact: false + BuildId: '0' + + - template: templates/qnn-ep-win.yml + parameters: + qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' + QnnSdk: ${{ parameters.QnnSdk }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + DoEsrp: true + ArtifactName: 'drop-nuget-qnn-arm64' + # Add --use_webgpu to enable WebGPU + buildParameter: '--arm64' + buildPlatform: 'ARM64' + buildArch: 'ARM64' + StageName: 'OnnxRuntime_QNN_Nuget_Win_Arm64' + build_config: 'RelWithDebInfo' + Is1ES: false + PublishArchive: true + + - stage: NugetPackaging + dependsOn: [Windows_Packaging_CUDA, OnnxRuntime_QNN_Nuget_Win_Arm64] + jobs: + - job: CreateNugetPackage + pool: 'Onnxruntime-Win2022-GPU-A10' + timeoutInMinutes: 120 + steps: + - checkout: self + clean: true + submodules: none + + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.12' + addToPath: true + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - managed nuget' + inputs: + artifactName: 'drop-nuget-qnn-arm64' + targetPath: '$(Build.BinariesDirectory)/managed-nuget' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - win-x64' + inputs: + artifactName: 'onnxruntime-win-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/win-x64' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - win-arm64' + inputs: + artifactName: 'onnxruntime-win-ARM64-qnn' + targetPath: '$(Build.BinariesDirectory)/win-arm64' + + - task: PowerShell@2 + displayName: 'Extract Nuget Package Version' + inputs: + targetType: 'inline' + script: | + $nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/managed-nuget -Filter Microsoft.ML.OnnxRuntime.Managed.*.nupkg -Recurse) + $package_name = $nupkgs[0].Name + $version_length = $package_name.Length - "Microsoft.ML.OnnxRuntime.Managed.".Length - ".nupkg".Length + $package_version = $package_name.Substring("Microsoft.ML.OnnxRuntime.Managed.".Length, $version_length) + Write-Host "##vso[task.setvariable variable=package_version;]$package_version" + workingDirectory: $(Build.BinariesDirectory) + + - task: PowerShell@2 + displayName: 'Extract Archives' + inputs: + targetType: 'inline' + script: | + Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64 + Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-ARM64-qnn*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64 + $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Filter 
onnxruntime-win-x64-cuda*)[0].FullName + $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Filter onnxruntime-win-ARM64-qnn*)[0].FullName + Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64" + Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64" + workingDirectory: $(Build.BinariesDirectory) + + - task: PythonScript@0 + displayName: 'Generate Nuget Package' + inputs: + scriptPath: '$(Build.SourcesDirectory)/tools/nuget/generate_nuspec_for_custom_nuget.py' + arguments: '--nuspec_path "$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec" --root_dir "$(Build.SourcesDirectory)" --commit_id "$(Build.SourceVersion)" --win_arm64 "$(win_arm64)" --win_x64 "$(win_x64)" --package_version "$(package_version)" --package_name "${{ parameters.PackageName }}"' + + - task: NuGetCommand@2 + displayName: 'Pack Nuget Package' + inputs: + command: 'pack' + packagesToPack: '$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec' + packDestination: $(Build.ArtifactStagingDirectory)\ + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: Nuget' + inputs: + pathtoPublish: '$(Build.ArtifactStagingDirectory)' + artifactName: '${{ parameters.PackageName }}' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml index 5ee425405ac70..e1a514ea54123 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml @@ -57,6 +57,22 @@ steps: copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + # Copy WebGPU dependencies if required + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\dxcompiler.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\dxil.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + + # Copy QNN dependencies if required + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_qnn.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\libQnnHtp*.so $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib /Y + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\libqnnhtp*.cat $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib /Y + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnCpu.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtp.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpPrepare.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy 
$(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV68Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV73Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSaver.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSystem.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + # copy trt ep libraries only when trt ep is enabled copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index b591a3e3e121b..3fa4799ec9c0e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -10,6 +10,8 @@ parameters: buildPlatform: 'x64' buildArch: 'x64' StageName: 'OnnxRuntime_QNN_Nuget_Win_x64' + Is1ES: true + PublishArchive: false stages: - stage: ${{ parameters.StageName }} @@ -107,6 +109,14 @@ stages: DoEsrp: ${{ parameters.DoEsrp }} Pattern: 'onnxruntime*.dll' + - ${{ if eq(parameters.PublishArchive, true) }}: + - template: c-api-artifacts-package-and-publish-steps-windows.yml + parameters: + buildConfig: ${{ parameters.build_config }} + artifactName: 'onnxruntime-win-${{ parameters.buildPlatform }}-qnn' + artifactNameNoVersionString: 'onnxruntime-win-${{ parameters.buildPlatform }}-qnn' + DoEsrp: ${{ parameters.DoEsrp }} + - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' inputs: @@ -155,8 +165,15 @@ stages: Contents: '*.snupkg' TargetFolder: '$(Build.ArtifactStagingDirectory)' - - task: 1ES.PublishPipelineArtifact@1 - displayName: 'Publish Pipeline x64 NuGet Artifact' - inputs: - artifactName: ${{ parameters.ArtifactName }} - targetPath: '$(Build.ArtifactStagingDirectory)' + - ${{ if eq(parameters.Is1ES, true) }}: + - task: 1ES.PublishPipelineArtifact@1 + displayName: 'Publish Pipeline x64 NuGet Artifact' + inputs: + artifactName: ${{ parameters.ArtifactName }} + targetPath: '$(Build.ArtifactStagingDirectory)' + - ${{ else }}: + - task: PublishPipelineArtifact@1 + displayName: 'Publish Pipeline x64 NuGet Artifact' + inputs: + artifactName: ${{ parameters.ArtifactName }} + targetPath: '$(Build.ArtifactStagingDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 600e6d857185f..69a06c3db24b8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -161,7 +161,7 @@ stages: displayName: 'Generate cmake config' inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --build 
--use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }} $(timeoutParameter) $(buildJavaParameter)' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --build --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }} $(timeoutParameter) $(buildJavaParameter)' workingDirectory: '$(Build.BinariesDirectory)' diff --git a/tools/nuget/generate_nuspec_for_custom_nuget.py b/tools/nuget/generate_nuspec_for_custom_nuget.py new file mode 100644 index 0000000000000..baf46743cbf1b --- /dev/null +++ b/tools/nuget/generate_nuspec_for_custom_nuget.py @@ -0,0 +1,150 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import argparse +import glob +import os +import shutil + +from generate_nuspec_for_native_nuget import generate_metadata + + +def generate_files(lines, args): + files_list = [""] + platform_map = { + "win-arm64": args.win_arm64, + "win-x64": args.win_x64, + } + + avoid_keywords = {"pdb"} + processed_includes = set() + for platform, platform_dir in platform_map.items(): + for file in glob.glob(os.path.join(platform_dir, "lib", "*")): + if not os.path.isfile(file): + continue + if any(keyword in file for keyword in avoid_keywords): + continue + file_name = os.path.basename(file) + + files_list.append(f'') + + for file in glob.glob(os.path.join(platform_dir, "include", "*")): + if not os.path.isfile(file): + continue + file_name = os.path.basename(file) + if file_name in processed_includes: + continue + processed_includes.add(file_name) + files_list.append(f'') + + files_list.append( + f'' + ) + + files_list.append(f'') + files_list.append( + f'' + ) + files_list.append(f'') + files_list.append( + f'' + ) + + source_props = os.path.join( + args.root_dir, + "csharp", + "src", + "Microsoft.ML.OnnxRuntime", + "targets", + "netstandard", + "props.xml", + ) + target_props = os.path.join( + args.root_dir, + "csharp", + "src", + "Microsoft.ML.OnnxRuntime", + "targets", + "netstandard", + f"{args.package_name}.props", + ) + shutil.copyfile(source_props, target_props) + files_list.append(f'') + files_list.append(f'') + + source_targets = os.path.join( + args.root_dir, + "csharp", + "src", + "Microsoft.ML.OnnxRuntime", + "targets", + "netstandard", + "targets.xml", + ) + target_targets = os.path.join( + args.root_dir, + "csharp", + "src", + "Microsoft.ML.OnnxRuntime", + "targets", + "netstandard", + f"{args.package_name}.targets", + ) + shutil.copyfile(source_targets, target_targets) + files_list.append(f'') + files_list.append(f'') + + files_list.append("") + lines.extend(files_list) + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Create a nuspec file for the custom nuget package.", + ) + + parser.add_argument("--nuspec_path", required=True, help="Nuspec output file path.") + parser.add_argument("--root_dir", required=True, help="ORT repository root.") + parser.add_argument( + "--commit_id", + required=True, + help="The last commit id included in this package.", + ) + parser.add_argument("--win_arm64", required=True, help="Ort win-arm64 directory") + parser.add_argument("--win_x64", required=True, help="Ort win-x64 directory") + parser.add_argument("--package_version", required=True, help="Version of the package") + 
parser.add_argument("--package_name", required=True, help="Name of the package") + + args = parser.parse_args() + + args.sdk_info = "" + + return args + + +def generate_nuspec(args: argparse.Namespace): + lines = [''] + lines.append("") + + generate_metadata(lines, args) + generate_files(lines, args) + + lines.append("") + return lines + + +def main(): + args = parse_arguments() + + lines = generate_nuspec(args) + + with open(os.path.join(args.nuspec_path), "w") as f: + for line in lines: + # Uncomment the printing of the line if you need to debug what's produced on a CI machine + print(line) + f.write(line) + f.write("\n") + + +if __name__ == "__main__": + main() From 7dbbfe0860e91a737cabe58ae02aeb3bba836417 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 8 Mar 2025 04:16:44 +1000 Subject: [PATCH 40/46] Fix license in example test code. (#23936) --- .../custom_execution_provider_library/my_execution_provider.cc | 2 +- .../custom_execution_provider_library/my_execution_provider.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc b/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc index 57471f7c029c2..27a4b06a99e64 100644 --- a/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc +++ b/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.cc @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. -// Confidential and Proprietary. +// Licensed under the MIT License. #include "my_execution_provider.h" #include "my_allocator.h" diff --git a/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.h b/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.h index ff0c7e80c4eeb..efb359a9e5e43 100644 --- a/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.h +++ b/onnxruntime/test/testdata/custom_execution_provider_library/my_execution_provider.h @@ -1,5 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. -// Confidential and Proprietary. +// Licensed under the MIT License. #pragma once From ab38607de08f7e69f1b43e254ceac80c66b2ae79 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 7 Mar 2025 11:01:53 -0800 Subject: [PATCH 41/46] replace usage of gsl::narrow and gsl::narrow_cast in WebGPU EP (#23926) ### Description `gsl::narrow` does not work in no exception build. - use `onnxruntime::narrow` if necessary; - or change to `static_cast` if it's obviously safe. also apply the changes to usage of `gsl::narrow_cast`, which does not apply checks. 
--- .../contrib_ops/webgpu/bert/fast_gelu.cc | 4 +-- .../webgpu/bert/flash_attention.cc | 2 +- .../webgpu/bert/rotary_embedding.cc | 14 +++++----- .../webgpu/bert/skip_layer_norm.cc | 4 +-- .../webgpu/quantization/dp4a_matmul_nbits.cc | 16 +++++------ .../webgpu/quantization/matmul_nbits.cc | 22 +++++++-------- .../subgroup_matrix_matmul_nbits.cc | 8 +++--- .../core/providers/webgpu/generator/range.cc | 2 +- .../webgpu/math/binary_elementwise_ops.cc | 2 +- .../webgpu/math/unary_elementwise_ops.cc | 2 +- .../core/providers/webgpu/nn/layer_norm.cc | 6 ++-- .../core/providers/webgpu/program_manager.cc | 10 +++---- .../core/providers/webgpu/shader_variable.cc | 2 +- .../core/providers/webgpu/tensor/cast.cc | 2 +- .../core/providers/webgpu/tensor/cast.h | 2 +- .../core/providers/webgpu/tensor/concat.cc | 2 +- .../core/providers/webgpu/tensor/expand.cc | 2 +- .../core/providers/webgpu/tensor/gather.cc | 2 +- .../core/providers/webgpu/tensor/pad.cc | 2 +- .../providers/webgpu/tensor/resize_impl.cc | 8 +++--- .../core/providers/webgpu/tensor/split.cc | 6 ++-- .../core/providers/webgpu/tensor/transpose.cc | 6 ++-- .../core/providers/webgpu/tensor/where.cc | 2 +- .../core/providers/webgpu/webgpu_context.cc | 4 +-- .../win-gpu-webgpu-ci-pipeline.yml | 28 +++++++++++++++++++ 25 files changed, 94 insertions(+), 66 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/bert/fast_gelu.cc b/onnxruntime/contrib_ops/webgpu/bert/fast_gelu.cc index a5cae7e7f6747..29ea4f81dd5e1 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/fast_gelu.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/fast_gelu.cc @@ -50,7 +50,7 @@ Status FastGelu::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) c const auto* bias = context.Input(1); auto* output = context.Output(0, input->Shape()); - uint32_t data_size = gsl::narrow(output->Shape().Size()); + uint32_t data_size = onnxruntime::narrow(output->Shape().Size()); if (data_size == 0) { return Status::OK(); } @@ -60,7 +60,7 @@ Status FastGelu::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) c int bias_components = 1; if (bias != nullptr) { - bias_size = gsl::narrow(bias->Shape().Size()); + bias_size = onnxruntime::narrow(bias->Shape().Size()); if (bias_size % 4 == 0) { bias_components = 4; bias_size = bias_size / 4; diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index c1b025b10e067..1e95d3d9610ff 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -98,7 +98,7 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components}, {present_value, ProgramTensorMetadataDependency::Rank, components}}) .AddIndices(valid_present_shape); - program.SetDispatchGroupSize(gsl::narrow(valid_kv_size + 63 / 64)) + program.SetDispatchGroupSize(onnxruntime::narrow(valid_kv_size + 63 / 64)) .SetWorkgroupSize(64) .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_) .AddUniformVariables({{static_cast(valid_kv_size)}, diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc index bc8b7493fc916..20e1583e0da8f 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc @@ -66,11 +66,11 @@ Status 
RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& con const auto* sin_cache = context.Input(3); auto* output = context.Output(0, input_shape); - const auto batch_size = gsl::narrow(input->Shape()[0]); - const auto batch_stride = gsl::narrow(input_shape.SizeFromDimension(1)); - const auto sequence_length = gsl::narrow(input_shape[input_shape.NumDimensions() - 2]); + const auto batch_size = onnxruntime::narrow(input->Shape()[0]); + const auto batch_stride = onnxruntime::narrow(input_shape.SizeFromDimension(1)); + const auto sequence_length = onnxruntime::narrow(input_shape[input_shape.NumDimensions() - 2]); const auto hidden_size = batch_stride / sequence_length; - const auto half_rotary_embedding_dim = gsl::narrow(cos_cache->Shape()[1]); + const auto half_rotary_embedding_dim = onnxruntime::narrow(cos_cache->Shape()[1]); const auto head_size = rotary_embedding_dim_ == 0 ? half_rotary_embedding_dim * 2 : hidden_size / num_heads_; // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape @@ -85,11 +85,11 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& con std::vector global_dims(rank); std::vector global_strides(rank); for (size_t j = 0; j < rank; ++j) { - global_dims[j] = gsl::narrow(global_shape[j]); - global_strides[j] = gsl::narrow(global_shape.SizeFromDimension(j + 1)); + global_dims[j] = onnxruntime::narrow(global_shape[j]); + global_strides[j] = onnxruntime::narrow(global_shape.SizeFromDimension(j + 1)); } - const auto output_size = gsl::narrow(global_shape.Size()); + const auto output_size = onnxruntime::narrow(global_shape.Size()); RotaryEmbeddingProgram program{interleaved_}; const auto input_output_strides = input_shape.NumDimensions() == 3 diff --git a/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc index a1840257d734f..d5d4632c01e2a 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc @@ -122,7 +122,7 @@ Status SkipLayerNorm::ComputeInternal(onnxruntime::webgpu::ComputeCo } const bool is_fp16 = x->GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; - const uint32_t hidden_size = gsl::narrow(x_shape[x_shape.NumDimensions() - 1]); + const uint32_t hidden_size = onnxruntime::narrow(x_shape[x_shape.NumDimensions() - 1]); const int components = GetMaxComponents(hidden_size); const bool has_input_skip_bias_sum = input_skip_bias_sum != nullptr; @@ -133,7 +133,7 @@ Status SkipLayerNorm::ComputeInternal(onnxruntime::webgpu::ComputeCo .AddInputs({{skip, ProgramTensorMetadataDependency::Type, components}}) .AddInputs({{gamma, ProgramTensorMetadataDependency::Type, components}}) .AddOutputs({{output, ProgramTensorMetadataDependency::None, components}}) - .SetDispatchGroupSize(gsl::narrow(ceil(1.0 * data_size / hidden_size))) + .SetDispatchGroupSize(onnxruntime::narrow(ceil(1.0 * data_size / hidden_size))) .AddUniformVariables({ {static_cast(components)}, }) diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc index 6720a6072f7bb..05cbfb1f99c48 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc @@ -277,9 +277,9 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor Tensor a_quant = context.CreateGPUTensor(DataTypeImpl::GetType(), a_quant_shape); 
TensorShapeVector a_scales_dims({1, 1, M, K / kBlockSizeA}); Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims); - quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec4Components)}}) - .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), gsl::narrow(1)}, - {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), gsl::narrow(1)}}) + quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)}}) + .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 1}, + {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}}) .AddUniformVariable({static_cast(M * K / kVec4Components)}); ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program)); @@ -290,16 +290,16 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor mul_program.SetDispatchGroupSize( (M + kTileSize - 1) / kTileSize, (N + kTileSize - 1) / kTileSize, 1); - mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec4Components)}, - {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}, - {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kVec2Components * kU32Components)}, - {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}}) + mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)}, + {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, 1}, + {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec2Components * kU32Components)}, + {scales, ProgramTensorMetadataDependency::TypeAndRank, 1}}) .AddUniformVariables({{static_cast(M)}, {static_cast(N)}, {static_cast(K)}, {static_cast(K / 8)}, {static_cast(K / 16)}}) - .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow(kVec4Components)}) + .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast(kVec4Components)}) .CacheHint("Block" + std::to_string(block_size)); return context.RunProgram(mul_program); } diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index e10a7f551eec9..cce10a59fbd4b 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -372,7 +372,7 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { } } else { const std::string quantized_data_type = QuantizedDataType(a.NumComponents()); - const int output_element_number = y.NumComponents() * gsl::narrow(output_number_); + const int output_element_number = y.NumComponents() * onnxruntime::narrow(output_number_); const uint32_t shared_memory_size = output_number_ * WORKGROUP_SIZE; std::string offset = "workgroup_idx * " + std::to_string(output_number_); @@ -548,16 +548,16 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context TensorShape b_shape({N_, K_}); ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b_shape, false, true)); auto* y = context.Output(0, helper.OutputShape()); - const uint32_t data_size = gsl::narrow(y->Shape().Size()); + const uint32_t data_size = onnxruntime::narrow(y->Shape().Size()); if (data_size == 0) { return Status::OK(); } - const uint32_t batch_count = gsl::narrow(helper.OutputOffsets().size()); - const uint32_t M = gsl::narrow(helper.M()); - const uint32_t 
N = gsl::narrow(helper.N()); - const uint32_t K = gsl::narrow(helper.K()); - const uint32_t block_size = gsl::narrow(block_size_); + const uint32_t batch_count = onnxruntime::narrow(helper.OutputOffsets().size()); + const uint32_t M = onnxruntime::narrow(helper.M()); + const uint32_t N = onnxruntime::narrow(helper.N()); + const uint32_t K = onnxruntime::narrow(helper.K()); + const uint32_t block_size = onnxruntime::narrow(block_size_); constexpr uint32_t nbits = 4; const uint32_t n_blocks_per_col = (K + block_size - 1) / block_size; @@ -584,7 +584,7 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context const uint32_t tile_m = M > kMinMForTileOptimization ? 4 : 1; const bool has_subgroup = context.Device().HasFeature(wgpu::FeatureName::Subgroups); const bool use_subgroup = has_subgroup && context.AdapterInfo().vendor == std::string_view{"intel"} && components_a == 4 && block_size == 32; - MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points, use_subgroup}; + MatMulNBitsProgram program{output_number, block_size, tile_m, static_cast(components_b), has_zero_points, use_subgroup}; if (M > kMinMForTileOptimization && block_size == 32) { components = 1; constexpr uint32_t workgroup_size = 64; @@ -614,10 +614,10 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context TensorShape reshaped_y_shape{batch_count, M, N / components}; program - .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, reshaped_a_shape, gsl::narrow(components_a)}, - {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, gsl::narrow(components_b * 4 /** b will be accessed as uint32 which includs 4 uint8. So here we need to multiply 4.*/)}, + .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, reshaped_a_shape, static_cast(components_a)}, + {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, static_cast(components_b * 4 /** b will be accessed as uint32 which includs 4 uint8. 
So here we need to multiply 4.*/)}, {scales, ProgramTensorMetadataDependency::None}}) - .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow(components)}) + .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast(components)}) .AddUniformVariable({block_size}); if (has_zero_points) { program.AddInput({zero_points, ProgramTensorMetadataDependency::None, {(zero_points->Shape().Size() + 3) / 4}, 4}); diff --git a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc index 2944a4d61b8ef..cb024d2a758a9 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc @@ -185,13 +185,13 @@ Status ApplySubgroupMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Te mul_program.SetDispatchGroupSize( (N + kTileSizeB - 1) / kTileSizeB, (M + kTileSizeA - 1) / kTileSizeA, 1); - mul_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}, - {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(kU32Components)}, - {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow(1)}}) + mul_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, 1}, + {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kU32Components)}, + {scales, ProgramTensorMetadataDependency::TypeAndRank, 1}}) .AddUniformVariables({{static_cast(M)}, {static_cast(N)}, {static_cast(K)}}) - .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, y_shape, gsl::narrow(1)}); + .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, y_shape, 1}); return context.RunProgram(mul_program); } diff --git a/onnxruntime/core/providers/webgpu/generator/range.cc b/onnxruntime/core/providers/webgpu/generator/range.cc index a0b65f08a5b4e..99c5a1c1b5566 100644 --- a/onnxruntime/core/providers/webgpu/generator/range.cc +++ b/onnxruntime/core/providers/webgpu/generator/range.cc @@ -23,7 +23,7 @@ Status Range::ComputeInternal(ComputeContext& context) const { return Status::OK(); } - uint32_t output_size = gsl::narrow(n); + uint32_t output_size = onnxruntime::narrow(n); RangeProgram program{}; #if defined(__GNUC__) #pragma GCC diagnostic push diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc index 75866513e2c7d..8a22e45f17047 100644 --- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc +++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc @@ -141,7 +141,7 @@ Status BinaryElementwise::ComputeInternal(ComputeContext& context) const { } } - uint32_t vec_size = gsl::narrow((size + 3) / 4); + uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); BinaryElementwiseProgram program{kernel_name_, expression_, is_broadcast, diff --git a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc index eaaad206ebaf5..189d7baafce6a 100644 --- a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc @@ -27,7 +27,7 @@ Status UnaryElementwise::ComputeInternal(ComputeContext& context) const { if (size == 0) { return Status::OK(); } - uint32_t vec_size = gsl::narrow((size + 3) / 4); + uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); 
UnaryElementwiseProgram program{kernel_name_, expression_, additional_impl_, additional_usage_}; program .AddInputs({{input_tensor, ProgramTensorMetadataDependency::Type, {vec_size}, 4}}) diff --git a/onnxruntime/core/providers/webgpu/nn/layer_norm.cc b/onnxruntime/core/providers/webgpu/nn/layer_norm.cc index 64172021e82f1..28ad686909a47 100644 --- a/onnxruntime/core/providers/webgpu/nn/layer_norm.cc +++ b/onnxruntime/core/providers/webgpu/nn/layer_norm.cc @@ -23,7 +23,7 @@ static size_t NormalizeAxis(int64_t axis, size_t tensor_rank) { if (axis < -rank && axis >= rank) { ORT_THROW("invalid axis: ", axis); } - return gsl::narrow(axis < 0 ? axis + rank : axis); + return onnxruntime::narrow(axis < 0 ? axis + rank : axis); } static std::string SumVector(std::string x, int components) { @@ -92,10 +92,10 @@ Status LayerNorm::ComputeInternal(onnxruntime::webgpu::ComputeContex const bool is_fp16 = x->GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; const size_t axis = NormalizeAxis(axis_, x_shape.NumDimensions()); - const uint32_t norm_count = gsl::narrow(x_shape.SizeToDimension(axis)); + const uint32_t norm_count = onnxruntime::narrow(x_shape.SizeToDimension(axis)); const int64_t norm_size = x_shape.SizeFromDimension(axis); const int components = GetMaxComponents(norm_size); - const uint32_t norm_size_vectorized = gsl::narrow((norm_size + components - 1) / components); + const uint32_t norm_size_vectorized = onnxruntime::narrow((norm_size + components - 1) / components); const auto scale_size = scale->Shape().Size(); const auto bias_size = (bias) ? bias->Shape().Size() : 0; diff --git a/onnxruntime/core/providers/webgpu/program_manager.cc b/onnxruntime/core/providers/webgpu/program_manager.cc index 1fdd312d4f0d8..7a4a873a1adf3 100644 --- a/onnxruntime/core/providers/webgpu/program_manager.cc +++ b/onnxruntime/core/providers/webgpu/program_manager.cc @@ -24,14 +24,14 @@ Status ProgramManager::NormalizeDispatchGroupSize(uint32_t& x, uint32_t& y, uint auto limit_per_dimension = limits_.maxComputeWorkgroupsPerDimension; if (x > limit_per_dimension || y > limit_per_dimension || z > limit_per_dimension) { - auto size = static_cast(x) * static_cast(y) * static_cast(z); - uint32_t dispatch_avg = gsl::narrow(std::ceil(std::sqrt(size))); + double size = static_cast(x) * static_cast(y) * static_cast(z); + double dispatch_avg = std::ceil(std::sqrt(size)); if (dispatch_avg > limit_per_dimension) { - dispatch_avg = gsl::narrow(std::ceil(std::cbrt(size))); + dispatch_avg = std::ceil(std::cbrt(size)); ORT_RETURN_IF(dispatch_avg > limit_per_dimension, "The dispatch group size exceeds WebGPU maximum."); - x = y = z = dispatch_avg; + x = y = z = static_cast(dispatch_avg); } else { - x = y = dispatch_avg; + x = y = static_cast(dispatch_avg); z = 1; } } diff --git a/onnxruntime/core/providers/webgpu/shader_variable.cc b/onnxruntime/core/providers/webgpu/shader_variable.cc index 5e5920f582251..f8e1e0b3b8d2b 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.cc +++ b/onnxruntime/core/providers/webgpu/shader_variable.cc @@ -91,7 +91,7 @@ ShaderIndicesHelper::ShaderIndicesHelper(std::string_view name, ProgramVariableD : name_(name), type_(type), num_components_{NumberOfComponents(type)}, - rank_{gsl::narrow(dims.NumDimensions())}, + rank_{static_cast(dims.NumDimensions())}, dims_{dims}, usage_(usage), indices_type_{GetIndicesType(rank_)}, diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.cc b/onnxruntime/core/providers/webgpu/tensor/cast.cc index 8b5bede34e6d0..7f92ea4ed3776 100644 --- 
a/onnxruntime/core/providers/webgpu/tensor/cast.cc +++ b/onnxruntime/core/providers/webgpu/tensor/cast.cc @@ -69,7 +69,7 @@ Status Cast::ComputeInternal(ComputeContext& context) const { if (size == 0) { return Status::OK(); } - uint32_t vec_size = gsl::narrow((size + 3) / 4); + uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); CastProgram program{to_}; program diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.h b/onnxruntime/core/providers/webgpu/tensor/cast.h index ef5c4d5d0dabe..925cd200f0aba 100644 --- a/onnxruntime/core/providers/webgpu/tensor/cast.h +++ b/onnxruntime/core/providers/webgpu/tensor/cast.h @@ -26,7 +26,7 @@ class Cast final : public WebGpuKernel { int64_t to; Status status = info.GetAttr("to", &to); ORT_ENFORCE(status.IsOK(), "Attribute to is not set."); - to_ = gsl::narrow(to); + to_ = onnxruntime::narrow(to); // ignore attribute 'saturate' as float8 is not supported in WebGPU } diff --git a/onnxruntime/core/providers/webgpu/tensor/concat.cc b/onnxruntime/core/providers/webgpu/tensor/concat.cc index 5ed8099fde05e..5cfd6c78f8929 100644 --- a/onnxruntime/core/providers/webgpu/tensor/concat.cc +++ b/onnxruntime/core/providers/webgpu/tensor/concat.cc @@ -104,7 +104,7 @@ Status Concat::ComputeInternal(ComputeContext& context) const { return Status::OK(); } - uint32_t output_size = gsl::narrow_cast(prepare.output_tensor->Shape().Size()); + uint32_t output_size = onnxruntime::narrow(prepare.output_tensor->Shape().Size()); size_t axis = static_cast(prepare.axis); ConcatProgram program{axis}; diff --git a/onnxruntime/core/providers/webgpu/tensor/expand.cc b/onnxruntime/core/providers/webgpu/tensor/expand.cc index 809616660aa9e..9bdebe2c1e0d3 100644 --- a/onnxruntime/core/providers/webgpu/tensor/expand.cc +++ b/onnxruntime/core/providers/webgpu/tensor/expand.cc @@ -42,7 +42,7 @@ Status Expand::ComputeInternal(ComputeContext& context) const { : 1; const int components_o = output_shape.IsScalar() ? 1 : output_shape[output_shape.NumDimensions() - 1] % 4 == 0 ? 
4 : 1; - uint32_t data_size = gsl::narrow(output_shape.Size() / components_o); + uint32_t data_size = onnxruntime::narrow(output_shape.Size() / components_o); ExpandProgram program{}; program diff --git a/onnxruntime/core/providers/webgpu/tensor/gather.cc b/onnxruntime/core/providers/webgpu/tensor/gather.cc index 9f6e5f2420d86..39d07991f3c5a 100644 --- a/onnxruntime/core/providers/webgpu/tensor/gather.cc +++ b/onnxruntime/core/providers/webgpu/tensor/gather.cc @@ -42,7 +42,7 @@ Status GatherProgram::GenerateShaderCode(ShaderHelper& shader) const { Status Gather::ComputeInternal(ComputeContext& context) const { Prepare p; ORT_RETURN_IF_ERROR(PrepareForCompute(&context.KernelContext(), p)); - uint32_t data_size = gsl::narrow(p.output_tensor->Shape().Size()); + uint32_t data_size = onnxruntime::narrow(p.output_tensor->Shape().Size()); if (data_size == 0) { return Status::OK(); } diff --git a/onnxruntime/core/providers/webgpu/tensor/pad.cc b/onnxruntime/core/providers/webgpu/tensor/pad.cc index 9ee13aada67fe..6a8bc6554b772 100644 --- a/onnxruntime/core/providers/webgpu/tensor/pad.cc +++ b/onnxruntime/core/providers/webgpu/tensor/pad.cc @@ -130,7 +130,7 @@ Status Pad::ComputeInternal(ComputeContext& context) const { } auto* output_tensor = context.Output(0, output_shape); - uint32_t output_size = gsl::narrow(output_shape.Size()); + uint32_t output_size = onnxruntime::narrow(output_shape.Size()); if (output_size == 0) { // Do not need to fill output, return return Status::OK(); diff --git a/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc b/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc index 455e7dc54bf1d..f68ace3c1d8a1 100644 --- a/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc +++ b/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc @@ -211,7 +211,7 @@ Status ResizeNearestImpl(ComputeContext& context, onnxruntime::ResizeNearestMode nearest_mode) { TensorShape output_shape(output_dims); auto* output_tensor = context.Output(0, output_shape); - uint32_t output_size = gsl::narrow(output_shape.Size()); + uint32_t output_size = onnxruntime::narrow(output_shape.Size()); ResizeNearestProgram program{coordinate_transform_mode, nearest_mode, extrapolation_enabled, rank}; program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank}) @@ -299,7 +299,7 @@ Status ResizeBilinearImpl(ComputeContext& context, onnxruntime::ResizeCoordinateTransformationMode coordinate_transform_mode) { TensorShape output_shape(output_dims); auto* output_tensor = context.Output(0, output_shape); - uint32_t output_size = gsl::narrow(output_shape.Size()); + uint32_t output_size = onnxruntime::narrow(output_shape.Size()); ResizeBilinearProgram program{coordinate_transform_mode, extrapolation_enabled, rank}; program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank}) @@ -413,7 +413,7 @@ Status ResizeTrilinearImpl(ComputeContext& context, onnxruntime::ResizeCoordinateTransformationMode coordinate_transform_mode) { TensorShape output_shape(output_dims); auto* output_tensor = context.Output(0, output_shape); - uint32_t output_size = gsl::narrow(output_shape.Size()); + uint32_t output_size = onnxruntime::narrow(output_shape.Size()); ResizeTrilinearProgram program{coordinate_transform_mode, extrapolation_enabled, rank}; program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank}) @@ -534,7 +534,7 @@ Status ResizeBiCubicImpl(ComputeContext& context, onnxruntime::ResizeCoordinateTransformationMode coordinate_transform_mode) { TensorShape 
output_shape(output_dims); auto* output_tensor = context.Output(0, output_shape); - uint32_t output_size = gsl::narrow(output_shape.Size()); + uint32_t output_size = onnxruntime::narrow(output_shape.Size()); ResizeBiCubicProgram program{coordinate_transform_mode, extrapolation_enabled, exclude_outside, rank}; program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank}) diff --git a/onnxruntime/core/providers/webgpu/tensor/split.cc b/onnxruntime/core/providers/webgpu/tensor/split.cc index 83bf832cc5b11..d93b75fa21c16 100644 --- a/onnxruntime/core/providers/webgpu/tensor/split.cc +++ b/onnxruntime/core/providers/webgpu/tensor/split.cc @@ -107,7 +107,7 @@ Status Split::ComputeInternal(ComputeContext& context) const { ORT_RETURN_IF_ERROR(PrepareForCompute(input_shape, num_outputs, axis, before_dims, after_dims_including_split_axis, after_dims_excluding_split, split_sizes)); - SplitProgram program{gsl::narrow_cast(axis)}; + SplitProgram program{static_cast(axis)}; program.AddInput({input, ProgramTensorMetadataDependency::TypeAndRank}); auto output_dimensions = input_shape.AsShapeVector(); @@ -120,7 +120,7 @@ Status Split::ComputeInternal(ComputeContext& context) const { program.AddOutput({output, ProgramTensorMetadataDependency::Rank}); } - uint32_t input_size = gsl::narrow(input_shape.Size()); + uint32_t input_size = onnxruntime::narrow(input_shape.Size()); // Early return if the input tensor is empty. if (input_size == 0) { return Status::OK(); @@ -130,7 +130,7 @@ Status Split::ComputeInternal(ComputeContext& context) const { std::vector sizes_in_split_axis; // sizes_in_split_axis are the cumulative sizes of the splits in the split axis. for (auto split_size : split_sizes) { - previous_sum += gsl::narrow(split_size); + previous_sum += onnxruntime::narrow(split_size); sizes_in_split_axis.push_back(previous_sum); } diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index 24b98e9533d17..0df7d1ae9fa2f 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -105,7 +105,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context, const Tensor& input, Tensor& output) { const auto& input_shape = input.Shape(); const auto& input_dims = input_shape.GetDims(); - int32_t rank = gsl::narrow_cast(input_shape.NumDimensions()); + int32_t rank = static_cast(input_shape.NumDimensions()); TensorShapeVector output_dims(rank); @@ -131,7 +131,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context, new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]}); } - uint32_t output_size = gsl::narrow_cast(input_shape.Size()); + uint32_t output_size = onnxruntime::narrow(input_shape.Size()); TransposeProgram program{permutations, use_shared}; if (use_shared) { @@ -156,7 +156,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context, Status Transpose::ComputeInternal(ComputeContext& context) const { const auto* input_tensor = context.Input(0); const TensorShape& input_shape = input_tensor->Shape(); - int32_t rank = gsl::narrow_cast(input_shape.NumDimensions()); + int32_t rank = static_cast(input_shape.NumDimensions()); TensorShapeVector output_dims(rank); InlinedVector default_perm(rank); diff --git a/onnxruntime/core/providers/webgpu/tensor/where.cc b/onnxruntime/core/providers/webgpu/tensor/where.cc index e8cdabb9dbe40..d7272ec525296 100644 --- 
a/onnxruntime/core/providers/webgpu/tensor/where.cc +++ b/onnxruntime/core/providers/webgpu/tensor/where.cc @@ -127,7 +127,7 @@ Status Where::ComputeInternal(ComputeContext& context) const { ORT_RETURN_IF_ERROR(ComputeOutputShape(cond_shape, x_shape, y_shape, output_shape)); auto* output_tensor = context.Output(0, output_shape); constexpr int component = 4; - uint32_t vec_size = gsl::narrow_cast((output_shape.Size() + 3) / component); + uint32_t vec_size = onnxruntime::narrow((output_shape.Size() + 3) / component); const auto is_broadcast = !(x_shape == y_shape && y_shape == cond_shape); WhereProgram program{is_broadcast}; diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 14c12ac247080..97144573dde2d 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -322,9 +322,9 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) { std::vector dims(expected_rank); std::vector stride(expected_rank - 1); for (size_t j = 0; j < expected_rank; ++j) { - dims[j] = gsl::narrow(shape[j]); + dims[j] = onnxruntime::narrow(shape[j]); if (j < expected_rank - 1) { - stride[j] = gsl::narrow(shape.SizeFromDimension(j + 1)); + stride[j] = onnxruntime::narrow(shape.SizeFromDimension(j + 1)); } } diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml index bb6c210161952..a0f22fcfce14e 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml @@ -105,3 +105,31 @@ stages: onnxruntime_webgpu_external_dawn_test.exe --no_proc_table displayName: Run tests (onnxruntime_webgpu_external_dawn_test) workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' + +- stage: webgpu_minimal_build_edge + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env.bat + buildArch: x64 + additionalBuildFlags: >- + --build_shared_lib + --disable_exceptions + --disable_rtti + --enable_msvc_static_runtime + --enable_reduced_operator_type_support + --skip_tests + --use_binskim_compliant_compile_flags + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF onnxruntime_DISABLE_SPARSE_TENSORS=ON onnxruntime_DISABLE_OPTIONAL_TYPE=ON + --minimal_build extended + --use_webgpu + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: false + ORT_EP_NAME: WebGPU + EnablePython: false + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-VS2022-webgpu-A10 From cffef2e028acf23dd1a579639c9591ac3730a866 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 7 Mar 2025 12:47:43 -0800 Subject: [PATCH 42/46] VCPKG improvement: set VCPKG_OSX_DEPLOYMENT_TARGET (#23933) ### Description 1. Set VCPKG_OSX_DEPLOYMENT_TARGET for macOS targets 2. Enable VCPKG in more pipelines. 
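For illustration, here is roughly what a generated macOS triplet looks like once this change is in effect. This is an abridged, hand-written sketch rather than a file from the repo: the triplet file name and the full set of `set()` lines depend on the RTTI/exception/ASan options passed to `generate_triplet_for_posix_platform`, and "13.3" is simply the `MACOSX_DEPLOYMENT_TARGET` value the packaging pipelines use. Only the `VCPKG_OSX_DEPLOYMENT_TARGET` line is new.

```cmake
# Abridged sketch of a generated triplet, e.g. <build_dir>/osx/arm64.cmake (hypothetical path).
set(VCPKG_TARGET_ARCHITECTURE arm64)
set(VCPKG_CRT_LINKAGE dynamic)        # macOS triplets are generated with dynamic CRT linkage
set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
set(VCPKG_OSX_ARCHITECTURES "arm64")
# New with this change: pins the minimum macOS version for vcpkg-built
# dependencies so it matches the ORT binaries themselves (analogous to
# CMAKE_OSX_DEPLOYMENT_TARGET for the main build).
set(VCPKG_OSX_DEPLOYMENT_TARGET "13.3")
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
```

Without a pinned `VCPKG_OSX_DEPLOYMENT_TARGET`, vcpkg-built dependencies could target a newer macOS version than the rest of the build, which can produce linker warnings or binaries that fail to run on the intended minimum OS.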
--- tools/ci_build/build.py | 14 +++- .../stages/py-cpu-packaging-stage.yml | 2 +- .../templates/windowsai-steps.yml | 2 +- .../win-qnn-arm64-ci-pipeline.yml | 2 +- .../azure-pipelines/win-qnn-ci-pipeline.yml | 2 +- tools/python/util/__init__.py | 3 +- tools/python/util/vcpkg_helpers.py | 78 +++++++++++++------ 7 files changed, 71 insertions(+), 32 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index fe20351b0e8bb..db7dbed23a2d2 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -35,7 +35,8 @@ def version_to_tuple(version: str) -> tuple: import util.android as android # noqa: E402 from util import ( # noqa: E402 generate_android_triplets, - generate_posix_triplets, + generate_linux_triplets, + generate_macos_triplets, generate_vcpkg_triplets_for_emscripten, generate_windows_triplets, get_logger, @@ -1115,7 +1116,6 @@ def generate_build_tree( cmake_extra_args, ): log.info("Generating CMake build tree") - cmake_dir = os.path.join(source_dir, "cmake") cmake_args = [cmake_path, cmake_dir] if not use_dev_mode(args): @@ -1330,8 +1330,16 @@ def generate_build_tree( generate_android_triplets(build_dir, args.android_cpp_shared, args.android_api) elif is_windows(): generate_windows_triplets(build_dir) + elif is_macOS(): + osx_target = args.apple_deploy_target + if args.apple_deploy_target is None: + osx_target = os.environ.get("MACOSX_DEPLOYMENT_TARGET") + if osx_target is not None: + log.info(f"Setting VCPKG_OSX_DEPLOYMENT_TARGET to {osx_target}") + generate_macos_triplets(build_dir, osx_target) else: - generate_posix_triplets(build_dir) + # Linux, *BSD, AIX or other platforms + generate_linux_triplets(build_dir) add_default_definition(cmake_extra_defines, "CMAKE_TOOLCHAIN_FILE", str(vcpkg_toolchain_path)) vcpkg_install_options = generate_vcpkg_install_options(build_dir, args) diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml index 4ff539df9f914..42d6e4371ccce 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml @@ -123,7 +123,7 @@ stages: --skip_submodule_sync --cmake_generator "Visual Studio 17 2022" --enable_pybind - --enable_onnx_tests + --enable_onnx_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache ${{ parameters.build_py_parameters }} --parallel --use_binskim_compliant_compile_flags --update --build $(TelemetryOption) diff --git a/tools/ci_build/github/azure-pipelines/templates/windowsai-steps.yml b/tools/ci_build/github/azure-pipelines/templates/windowsai-steps.yml index fb3ebdc760a7b..355a575307f0b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/windowsai-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/windowsai-steps.yml @@ -89,7 +89,7 @@ jobs: # must call vsdevcmd first to add cmake to PATH - script: | python --version - python "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --parallel --use_binskim_compliant_compile_flags --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos --windows_sdk_version "10.0.22621.0" $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" + python 
"$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --parallel --use_binskim_compliant_compile_flags --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos --use_vcpkg --use_vcpkg_ms_internal_asset_cache --windows_sdk_version "10.0.22621.0" $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Generate cmake config' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index e08d7eb2b12de..1c3d911fa7dbb 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -90,7 +90,7 @@ jobs: --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" - --build_shared_lib + --build_shared_lib --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_qnn $(QnnLibKind) --qnn_home $(QnnSDKRootDir) --update --build --parallel diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index 81de3335a07d2..faef469e010f6 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -78,7 +78,7 @@ jobs: --build_dir $(Build.BinariesDirectory) --cmake_generator "Visual Studio 17 2022" --build_java - --build_shared_lib + --build_shared_lib --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_qnn $(QnnLibKind) --qnn_home $(QnnSDKRootDir) --use_binskim_compliant_compile_flags diff --git a/tools/python/util/__init__.py b/tools/python/util/__init__.py index a669963e84bcf..8631218ca9e00 100644 --- a/tools/python/util/__init__.py +++ b/tools/python/util/__init__.py @@ -7,7 +7,8 @@ from .run import run # noqa: F401 from .vcpkg_helpers import ( # noqa: F401 generate_android_triplets, - generate_posix_triplets, + generate_linux_triplets, + generate_macos_triplets, generate_vcpkg_triplets_for_emscripten, generate_windows_triplets, ) diff --git a/tools/python/util/vcpkg_helpers.py b/tools/python/util/vcpkg_helpers.py index d33b2f7675690..875a6186e55c2 100644 --- a/tools/python/util/vcpkg_helpers.py +++ b/tools/python/util/vcpkg_helpers.py @@ -222,6 +222,7 @@ def generate_triplet_for_posix_platform( enable_asan: bool, crt_linkage: str, target_abi: str, + osx_deployment_target: str, ) -> None: """ Generate triplet file for POSIX platforms (Linux, macOS). @@ -235,6 +236,7 @@ def generate_triplet_for_posix_platform( enable_asan (bool): Flag indicating if AddressSanitizer is enabled. crt_linkage (str): The CRT linkage type ("static" or "dynamic"). target_abi (str): The target ABI, which maps to the VCPKG_TARGET_ARCHITECTURE variable. Valid options include x86, x64, arm, arm64, arm64ec, s390x, ppc64le, riscv32, riscv64, loongarch32, loongarch64, mips64. + osx_deployment_target (str, optional): The macOS deployment target version. The parameter sets the minimum macOS version for compiled binaries. It also changes what versions of the macOS platform SDK CMake will search for. See the CMake documentation for CMAKE_OSX_DEPLOYMENT_TARGET for more information. 
""" folder_name_parts = [] if enable_asan: @@ -341,6 +343,8 @@ def generate_triplet_for_posix_platform( else: osx_abi = target_abi f.write(f'set(VCPKG_OSX_ARCHITECTURES "{osx_abi}")\n') + if osx_deployment_target: + f.write(f'set(VCPKG_OSX_DEPLOYMENT_TARGET "{osx_deployment_target}")\n') f.write("set(CMAKE_POSITION_INDEPENDENT_CODE ON)\n") f.write( "list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)\n" @@ -501,32 +505,58 @@ def generate_windows_triplets(build_dir: str) -> None: add_port_configs(f, enable_exception, False) -def generate_posix_triplets(build_dir: str) -> None: +def generate_linux_triplets(build_dir: str) -> None: """ - Generate triplet files for POSIX platforms (Linux, macOS). + Generate triplet files for Linux platforms. Args: build_dir (str): The directory to save the generated triplet files. """ - for os_name in ["linux", "osx"]: - if os_name == "linux": - target_abis = ["x86", "x64", "arm", "arm64", "s390x", "ppc64le", "riscv64", "loongarch64", "mips64"] - else: - target_abis = ["x64", "arm64", "universal2"] - for enable_rtti in [True, False]: - for enable_exception in [True, False]: - for enable_binskim in [True, False]: - for enable_asan in [True, False]: - if enable_asan and enable_binskim: - continue - for target_abi in target_abis: - generate_triplet_for_posix_platform( - build_dir, - os_name, - enable_rtti, - enable_exception, - enable_binskim, - enable_asan, - "dynamic", - target_abi, - ) + target_abis = ["x86", "x64", "arm", "arm64", "s390x", "ppc64le", "riscv64", "loongarch64", "mips64"] + for enable_rtti in [True, False]: + for enable_exception in [True, False]: + for enable_binskim in [True, False]: + for enable_asan in [True, False]: + if enable_asan and enable_binskim: + continue + for target_abi in target_abis: + generate_triplet_for_posix_platform( + build_dir, + "linux", + enable_rtti, + enable_exception, + enable_binskim, + enable_asan, + "dynamic", + target_abi, + None, + ) + + +def generate_macos_triplets(build_dir: str, osx_deployment_target: str) -> None: + """ + Generate triplet files for macOS platforms. + + Args: + build_dir (str): The directory to save the generated triplet files. + osx_deployment_target (str, optional): The macOS deployment target version. + """ + target_abis = ["x64", "arm64", "universal2"] + for enable_rtti in [True, False]: + for enable_exception in [True, False]: + for enable_binskim in [True, False]: + for enable_asan in [True, False]: + if enable_asan and enable_binskim: + continue + for target_abi in target_abis: + generate_triplet_for_posix_platform( + build_dir, + "osx", + enable_rtti, + enable_exception, + enable_binskim, + enable_asan, + "dynamic", + target_abi, + osx_deployment_target, + ) From 49328fe63789c424a4b9335ebee791ec8f154e1c Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 7 Mar 2025 14:04:02 -0800 Subject: [PATCH 43/46] Allow using a different version of flatbuffers when building with vcpkg (#23946) ### Description Allow using a different version of flatbuffers when building with vcpkg, so that users do not need to pin flatbuffer's version, which provides more flexibility in the build process. Delete utf8_range from the dependencies, because it is an indirect dependency of protobuf, which is already included in the build process. 
### Motivation and Context --- cmake/deps.txt | 1 - .../external/onnxruntime_external_deps.cmake | 21 ++++--------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index d0bab93d3c16f..c7db8ef51505d 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -53,7 +53,6 @@ re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cd safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.5.1.zip;e49b2b964163d27765a5002d210a2f3c73771835 -utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0c12f53da76d0c31b03b9f0f8ec8f3b4.zip;239063aee4946a9af147b473a4c3da78ba7413b4 composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 2ab9fc129a90d..a477d6edb3a3f 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -107,23 +107,6 @@ if(onnxruntime_USE_MIMALLOC) FetchContent_MakeAvailable(mimalloc) endif() -#Protobuf depends on utf8_range -onnxruntime_fetchcontent_declare( - utf8_range - URL ${DEP_URL_utf8_range} - URL_HASH SHA1=${DEP_SHA1_utf8_range} - EXCLUDE_FROM_ALL - FIND_PACKAGE_ARGS NAMES utf8_range -) - -set(utf8_range_ENABLE_TESTS OFF CACHE BOOL "Build test suite" FORCE) -set(utf8_range_ENABLE_INSTALL OFF CACHE BOOL "Configure installation" FORCE) - -# The next line will generate an error message "fatal: not a git repository", but it is ok. It is from flatbuffers -onnxruntime_fetchcontent_makeavailable(utf8_range) -# protobuf's cmake/utf8_range.cmake has the following line -include_directories(${utf8_range_SOURCE_DIR}) - # Download a protoc binary from Internet if needed if(NOT ONNX_CUSTOM_PROTOC_EXECUTABLE AND NOT onnxruntime_USE_VCPKG) # This part of code is only for users' convenience. The code couldn't handle all cases. 
Users always can manually @@ -442,6 +425,9 @@ target_include_directories(safeint_interface INTERFACE ${safeint_SOURCE_DIR}) # Flatbuffers +if(onnxruntime_USE_VCPKG) + find_package(flatbuffers REQUIRED) +else() # We do not need to build flatc for iOS or Android Cross Compile if (CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(FLATBUFFERS_BUILD_FLATC OFF CACHE BOOL "FLATBUFFERS_BUILD_FLATC" FORCE) @@ -492,6 +478,7 @@ namespace std { using ::getenv; } endif() endif() endif() +endif() # ONNX if (NOT onnxruntime_USE_FULL_PROTOBUF) From 95dcd15053575e01a6bdc8ef7af347e553be9ccf Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 7 Mar 2025 20:05:05 -0500 Subject: [PATCH 44/46] Make python package pipeline 1ES compliant (#23800) ### Description Make [Python packaging pipeline](https://aiinfra.visualstudio.com/530acbc4-21bc-487d-8cd8-348ff451d2ff/_build?definitionId=841) 1ES compliant ### Motivation and Context ### Checklist - [x] Make Onnxruntime-QNNEP-Windows-2022-CPU stateless --- .../py-package-test-pipeline.yml | 2 + .../azure-pipelines/py-packaging-pipeline.yml | 50 ++-- .../stages/py-cpu-packaging-stage.yml | 122 ++++---- .../templates/py-linux-qnn.yml | 118 ++++---- .../azure-pipelines/templates/py-linux.yml | 144 +++++---- .../templates/py-package-smoking-test.yml | 13 +- .../templates/py-packaging-linux-test-cpu.yml | 18 +- .../templates/py-win-arm64-qnn.yml | 273 +++++++++--------- .../templates/py-win-arm64ec-qnn.yml | 241 ++++++++-------- .../templates/py-win-x64-qnn.yml | 21 +- .../templates/react-native-ci.yml | 12 +- 11 files changed, 564 insertions(+), 450 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index a0e49692220f9..7a78c6ba0fcdf 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -31,10 +31,12 @@ stages: machine_pool: vmImage: 'macOS-13' itemPattern: '*/*mac*x86_64.whl' + arch: 'x86_64' - template: templates/py-package-smoking-test.yml parameters: job_name: Test_LINUX_x86_64_Wheels itemPattern: '*/*manylinux*x86_64.whl' + arch: 'x86_64' machine_pool: name: 'onnxruntime-Ubuntu2204-AMD-CPU' diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml index 01d30d0e1ba86..28ddd29ec63e6 100644 --- a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml @@ -50,10 +50,10 @@ parameters: displayName: 'Linux packages cmake build type. Linux Only.' default: 'Release' values: - - Debug - - Release - - RelWithDebInfo - - MinSizeRel + - Debug + - Release + - RelWithDebInfo + - MinSizeRel # Only applies to QNN packages. - name: qnn_sdk_version @@ -63,17 +63,33 @@ parameters: trigger: none -stages: -- template: stages/py-cpu-packaging-stage.yml +resources: + repositories: + - repository: 1esPipelines + type: git + name: 1ESPipelineTemplates/1ESPipelineTemplates + ref: refs/tags/release +extends: + # The pipeline extends the 1ES PT which will inject different SDL and compliance tasks. + # For non-production pipelines, use "Unofficial" as defined below. + # For productions pipelines, use "Official". 
+ template: v1/1ES.Official.PipelineTemplate.yml@1esPipelines parameters: - enable_linux_cpu: ${{ parameters.enable_linux_cpu }} - enable_windows_cpu: ${{ parameters.enable_windows_cpu }} - enable_mac_cpu: ${{ parameters.enable_mac_cpu }} - enable_linux_arm: ${{ parameters.enable_linux_arm }} - enable_windows_arm64_qnn: ${{ parameters.enable_windows_arm64_qnn }} - enable_windows_arm64ec_qnn: ${{ parameters.enable_windows_arm64ec_qnn }} - enable_windows_x64_qnn: ${{ parameters.enable_windows_x64_qnn }} - enable_linux_x64_qnn: ${{ parameters.enable_linux_x64_qnn }} - build_py_parameters: ${{ parameters.build_py_parameters }} - cmake_build_type: ${{ parameters.cmake_build_type }} - qnn_sdk_version: ${{ parameters.qnn_sdk_version }} + sdl: + sourceAnalysisPool: + name: onnxruntime-Win-CPU-2022 + os: windows + stages: + - template: stages/py-cpu-packaging-stage.yml + parameters: + enable_linux_cpu: ${{ parameters.enable_linux_cpu }} + enable_windows_cpu: ${{ parameters.enable_windows_cpu }} + enable_mac_cpu: ${{ parameters.enable_mac_cpu }} + enable_linux_arm: ${{ parameters.enable_linux_arm }} + enable_windows_arm64_qnn: ${{ parameters.enable_windows_arm64_qnn }} + enable_windows_arm64ec_qnn: ${{ parameters.enable_windows_arm64ec_qnn }} + enable_windows_x64_qnn: ${{ parameters.enable_windows_x64_qnn }} + enable_linux_x64_qnn: ${{ parameters.enable_linux_x64_qnn }} + build_py_parameters: ${{ parameters.build_py_parameters }} + cmake_build_type: ${{ parameters.cmake_build_type }} + qnn_sdk_version: ${{ parameters.qnn_sdk_version }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml index 42d6e4371ccce..5e783607e3622 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml @@ -151,10 +151,11 @@ stages: Contents: '*.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' - - task: PublishBuildArtifacts@1 + - task: 1ES.PublishPipelineArtifact@1 displayName: 'Publish Artifact: ONNXRuntime python wheel' inputs: - ArtifactName: onnxruntime + artifactName: onnxruntime-win-$(PythonVersion) + targetPath: '$(Build.ArtifactStagingDirectory)' - script: | 7z x *.whl @@ -199,7 +200,9 @@ stages: workspace: clean: all pool: - vmImage: 'macOS-13' + name: "Azure Pipelines" + image: "macOS-13" + os: macOS variables: MACOSX_DEPLOYMENT_TARGET: '13.3' strategy: @@ -251,74 +254,81 @@ stages: Contents: '*.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' - - task: PublishBuildArtifacts@1 + - task: 1ES.PublishPipelineArtifact@1 displayName: 'Publish Artifact: ONNXRuntime python wheel' inputs: - ArtifactName: onnxruntime + artifactName: onnxruntime-macos-$(PythonVersion) + targetPath: '$(Build.ArtifactStagingDirectory)' - template: ../templates/component-governance-component-detection-steps.yml parameters: condition: 'succeeded' - - ${{ if eq(parameters.enable_linux_arm, true) }}: - - stage: Python_Packaging_Linux_ARM - dependsOn: [] - jobs: - - template: ../templates/py-linux.yml - parameters: - arch: 'aarch64' - machine_pool: 'onnxruntime-linux-ARM64-CPU-2019' - extra_build_arg: ${{ parameters.build_py_parameters }} - cmake_build_type: ${{ parameters.cmake_build_type }} - - - ${{ if eq(parameters.enable_linux_cpu, true) }}: - - stage: Python_Packaging_Linux_CPU - dependsOn: [] - jobs: +- ${{ if eq(parameters.enable_linux_arm, true) }}: + - stage: Python_Packaging_Linux_ARM + dependsOn: [] + jobs: - 
template: ../templates/py-linux.yml parameters: - arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU-Large' + arch: 'aarch64' + machine_pool: 'onnxruntime-linux-ARM64-CPU-2019' extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} + is1ES: true - - ${{ if eq(parameters.enable_windows_arm64_qnn, true) }}: - - stage: Python_Packaging_Windows_ARM64_QNN - dependsOn: [] - jobs: - - template: ../templates/py-win-arm64-qnn.yml +- ${{ if eq(parameters.enable_linux_cpu, true) }}: + - stage: Python_Packaging_Linux_CPU + dependsOn: [] + jobs: + - template: ../templates/py-linux.yml + parameters: + arch: 'x86_64' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU-Large' + extra_build_arg: ${{ parameters.build_py_parameters }} + cmake_build_type: ${{ parameters.cmake_build_type }} + is1ES: true + +- ${{ if eq(parameters.enable_windows_arm64_qnn, true) }}: + - stage: Python_Packaging_Windows_ARM64_QNN + dependsOn: [] + jobs: + - template: ../templates/py-win-arm64-qnn.yml + parameters: + MACHINE_POOL: 'onnxruntime-qnn-windows-vs-2022-arm64' + QNN_SDK: ${{ parameters.qnn_sdk_version }} + BUILD_PY_PARAMETERS: ${{ parameters.build_py_parameters }} + is1ES: true + +- ${{ if eq(parameters.enable_windows_arm64ec_qnn, true) }}: + - stage: Python_Packaging_Windows_arm64ec_QNN + dependsOn: [] + jobs: + - template: ../templates/py-win-arm64ec-qnn.yml parameters: - MACHINE_POOL: 'onnxruntime-qnn-windows-vs-2022-arm64' + MACHINE_POOL: 'Onnxruntime-QNNEP-Windows-2022-CPU' QNN_SDK: ${{ parameters.qnn_sdk_version }} BUILD_PY_PARAMETERS: ${{ parameters.build_py_parameters }} + is1ES: true - - ${{ if eq(parameters.enable_windows_arm64ec_qnn, true) }}: - - stage: Python_Packaging_Windows_arm64ec_QNN - dependsOn: [] - jobs: - - template: ../templates/py-win-arm64ec-qnn.yml - parameters: - MACHINE_POOL: 'Onnxruntime-QNNEP-Windows-2022-CPU' - QNN_SDK: ${{ parameters.qnn_sdk_version }} - BUILD_PY_PARAMETERS: ${{ parameters.build_py_parameters }} - - - ${{ if eq(parameters.enable_windows_x64_qnn, true) }}: - - stage: Python_Packaging_Windows_x64_QNN - dependsOn: [] - jobs: - - template: ../templates/py-win-x64-qnn.yml - parameters: - MACHINE_POOL: 'Onnxruntime-QNNEP-Windows-2022-CPU' - QNN_SDK: ${{ parameters.qnn_sdk_version }} - BUILD_PY_PARAMETERS: ${{ parameters.build_py_parameters }} - - - ${{ if eq(parameters.enable_linux_x64_qnn, true) }}: - - stage: Python_Packaging_Linux_x64_QNN - dependsOn: [] - jobs: - - template: ../templates/py-linux-qnn.yml +- ${{ if eq(parameters.enable_windows_x64_qnn, true) }}: + - stage: Python_Packaging_Windows_x64_QNN + dependsOn: [] + jobs: + - template: ../templates/py-win-x64-qnn.yml parameters: - machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' - extra_build_arg: ${{ parameters.build_py_parameters }} - cmake_build_type: ${{ parameters.cmake_build_type }} + MACHINE_POOL: 'Onnxruntime-QNNEP-Windows-2022-CPU' + QNN_SDK: ${{ parameters.qnn_sdk_version }} + BUILD_PY_PARAMETERS: ${{ parameters.build_py_parameters }} + is1ES: true + +- ${{ if eq(parameters.enable_linux_x64_qnn, true) }}: + - stage: Python_Packaging_Linux_x64_QNN + dependsOn: [] + jobs: + - template: ../templates/py-linux-qnn.yml + parameters: + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' + extra_build_arg: ${{ parameters.build_py_parameters }} + cmake_build_type: ${{ parameters.cmake_build_type }} + is1ES: true diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml index 
347a3145e8c70..8126cda449daa 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml @@ -6,10 +6,10 @@ parameters: type: string default: 'Release' values: - - Debug - - Release - - RelWithDebInfo - - MinSizeRel + - Debug + - Release + - RelWithDebInfo + - MinSizeRel - name: device type: string @@ -27,68 +27,82 @@ parameters: displayName: QNN SDK version type: string default: 2.31.0.250130 + +- name: is1ES + displayName: 'Whether the pipeline is running in 1ES' + type: boolean + default: false jobs: - job: Linux_py_qnn_Wheels_x64 timeoutInMinutes: 240 workspace: clean: all - pool: ${{ parameters.machine_pool }} + pool: + name: ${{ parameters.machine_pool }} + os: linux variables: - # The build machine pool doesn't have dotnet, so it can't run CG. - - name: skipComponentGovernanceDetection - value: true - - name: ORT_CACHE_DIR - value: $(Agent.TempDirectory)/ort_ccache - - name: TODAY - value: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - - name: extra_build_args - ${{ if ne(parameters.extra_build_arg, '') }}: - value: -x ${{ parameters.extra_build_arg }} - ${{ if eq(parameters.extra_build_arg, '') }}: - value: '' + # The build machine pool doesn't have dotnet, so it can't run CG. + - name: skipComponentGovernanceDetection + value: true + - name: ORT_CACHE_DIR + value: $(Agent.TempDirectory)/ort_ccache + - name: TODAY + value: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + - name: extra_build_args + ${{ if ne(parameters.extra_build_arg, '') }}: + value: -x ${{ parameters.extra_build_arg }} + ${{ if eq(parameters.extra_build_arg, '') }}: + value: '' steps: - - checkout: self - clean: true - submodules: none + - checkout: self + clean: true + submodules: none - - template: jobs/download_linux_qnn_sdk.yml - parameters: - QnnSDKVersion: ${{ parameters.QnnSdk }} + - template: jobs/download_linux_qnn_sdk.yml + parameters: + QnnSDKVersion: ${{ parameters.QnnSdk }} - - template: set-nightly-build-option-variable-step.yml + - template: set-nightly-build-option-variable-step.yml - - template: get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile - Context: tools/ci_build/github/linux/docker/inference/x86_64/python/cpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimecpubuildpythonx86_64_qnn + - template: get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile + Context: tools/ci_build/github/linux/docker/inference/x86_64/python/cpu + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + Repository: onnxruntimecpubuildpythonx86_64_qnn - - template: linux-build-step-with-cache.yml - parameters: - WithCache: ${{parameters.with_cache}} - Today: $(TODAY) - AdditionalKey: Linux_py_qnn_Wheels_x64 - CacheDir: $(ORT_CACHE_DIR) - ChangeEveryCommit: true - BuildStep: - - task: Bash@3 - displayName: 'Build Python Wheel' - inputs: - targetType: filePath - filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh - arguments: -i onnxruntimecpubuildpythonx86_64_qnn -d "${{ parameters.device }}" -c ${{ parameters.cmake_build_type }} $(extra_build_args) - env: - ADDITIONAL_DOCKER_PARAMETER: "--volume $(QnnSDKRootDir):/qnn_sdk" + - template: linux-build-step-with-cache.yml + parameters: + WithCache: ${{parameters.with_cache}} + Today: $(TODAY) + AdditionalKey: Linux_py_qnn_Wheels_x64 + CacheDir: $(ORT_CACHE_DIR) + 
ChangeEveryCommit: true + BuildStep: + - task: Bash@3 + displayName: 'Build Python Wheel' + inputs: + targetType: filePath + filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh + arguments: -i onnxruntimecpubuildpythonx86_64_qnn -d "${{ parameters.device }}" -c ${{ parameters.cmake_build_type }} $(extra_build_args) + env: + ADDITIONAL_DOCKER_PARAMETER: "--volume $(QnnSDKRootDir):/qnn_sdk" + - ${{ if eq(parameters.is1ES, true) }}: + - task: 1ES.PublishPipelineArtifact@1 + displayName: 'Publish Artifact: Linux ONNXRuntime QNN python wheel' + inputs: + targetPath: '$(Build.BinariesDirectory)/dist' + artifactName: onnxruntime-linux-qnn-x64 - - task: PublishBuildArtifacts@1 + - ${{ if eq(parameters.is1ES, false) }}: + - task: PublishPipelineArtifact@1 displayName: 'Publish Artifact: Linux ONNXRuntime QNN python wheel' inputs: - PathtoPublish: '$(Build.BinariesDirectory)/dist' - ArtifactName: onnxruntime-linux-qnn-x64 + targetPath: '$(Build.BinariesDirectory)/dist' + artifactName: onnxruntime-linux-qnn-x64 - - template: component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' + - template: component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux.yml index e591b719ecfa9..8d0c4334f4874 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux.yml @@ -9,10 +9,10 @@ parameters: type: string default: 'Release' values: - - Debug - - Release - - RelWithDebInfo - - MinSizeRel + - Debug + - Release + - RelWithDebInfo + - MinSizeRel - name: device type: string @@ -34,76 +34,98 @@ parameters: type: string default: '' +- name: is1ES + displayName: 'Whether the pipeline is running in 1ES' + type: boolean + default: false + jobs: - job: Linux_py_Wheels_${{ parameters.arch }}_${{parameters.ep}} timeoutInMinutes: 240 workspace: clean: all - pool: ${{ parameters.machine_pool }} + pool: + name: ${{ parameters.machine_pool }} + os: 'linux' + ${{ if eq(parameters.arch, 'aarch64') }}: + hostArchitecture: Arm64 variables: - # The build machine pool doesn't have dotnet, so it can't run CG. - - name: skipComponentGovernanceDetection - value: true - - name: ORT_CACHE_DIR - value: $(Agent.TempDirectory)/ort_ccache - - name: TODAY - value: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - - name: extra_build_args - ${{ if ne(parameters.extra_build_arg, '') }}: - value: '-x ${{ parameters.extra_build_arg }}' - ${{ if eq(parameters.extra_build_arg, '') }}: - value: '' - - name: python_exe_path - ${{ if ne(parameters.python_exe_path, '') }}: - value: '-p ${{ parameters.python_exe_path }}' - ${{ if eq(parameters.python_exe_path, '') }}: - value: '' + # The build machine pool doesn't have dotnet, so it can't run CG. 
+ - name: skipComponentGovernanceDetection + value: true + - name: ORT_CACHE_DIR + value: $(Agent.TempDirectory)/ort_ccache + - name: TODAY + value: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + - name: extra_build_args + ${{ if ne(parameters.extra_build_arg, '') }}: + value: '-x ${{ parameters.extra_build_arg }}' + ${{ if eq(parameters.extra_build_arg, '') }}: + value: '' + - name: python_exe_path + ${{ if ne(parameters.python_exe_path, '') }}: + value: '-p ${{ parameters.python_exe_path }}' + ${{ if eq(parameters.python_exe_path, '') }}: + value: '' steps: - - checkout: self - clean: true - submodules: none - - - template: set-nightly-build-option-variable-step.yml - - - template: get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cpu/Dockerfile - Context: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimecpubuildpython${{ parameters.arch }} - - - template: linux-build-step-with-cache.yml - parameters: - WithCache: ${{parameters.with_cache}} - Today: $(TODAY) - AdditionalKey: Linux_py_Wheels_${{ parameters.arch }} - CacheDir: $(ORT_CACHE_DIR) - ChangeEveryCommit: true - BuildStep: - - task: Bash@3 - displayName: 'Build Python Wheel' - inputs: - targetType: filePath - filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh - arguments: -i onnxruntimecpubuildpython${{ parameters.arch }} -d "${{ parameters.device }}" -c ${{ parameters.cmake_build_type }} $(extra_build_args) $(python_exe_path) - ${{ if eq(parameters.with_cache, 'true') }}: - env: - ADDITIONAL_DOCKER_PARAMETER: "--volume $(ORT_CACHE_DIR):/cache -e CCACHE_DIR=/cache -e ORT_BUILD_WITH_CACHE=1" - - - task: PublishBuildArtifacts@1 + - checkout: self + clean: true + submodules: none + + - template: set-nightly-build-option-variable-step.yml + + - template: get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cpu/Dockerfile + Context: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cpu + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + Repository: onnxruntimecpubuildpython${{ parameters.arch }} + + - template: linux-build-step-with-cache.yml + parameters: + WithCache: ${{parameters.with_cache}} + Today: $(TODAY) + AdditionalKey: Linux_py_Wheels_${{ parameters.arch }} + CacheDir: $(ORT_CACHE_DIR) + ChangeEveryCommit: true + BuildStep: + - task: Bash@3 + displayName: 'Build Python Wheel' + inputs: + targetType: filePath + filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh + arguments: -i onnxruntimecpubuildpython${{ parameters.arch }} -d "${{ parameters.device }}" -c ${{ parameters.cmake_build_type }} $(extra_build_args) $(python_exe_path) + ${{ if eq(parameters.with_cache, 'true') }}: + env: + ADDITIONAL_DOCKER_PARAMETER: "--volume $(ORT_CACHE_DIR):/cache -e CCACHE_DIR=/cache -e ORT_BUILD_WITH_CACHE=1" + + - ${{ if eq(parameters.is1ES, true) }}: + - task: 1ES.PublishPipelineArtifact@1 displayName: 'Publish Artifact: ONNXRuntime python wheel' inputs: - PathtoPublish: '$(Build.BinariesDirectory)/dist' - ArtifactName: onnxruntime-${{ parameters.ep }} - - - task: PublishPipelineArtifact@0 + targetPath: '$(Build.BinariesDirectory)/dist' + artifactName: onnxruntime-${{ parameters.arch }}-${{ parameters.ep }} + - task: 1ES.PublishPipelineArtifact@1 + displayName: 'Publish Test Binaries' + inputs: + artifactName: 
'drop-linux-cpu-${{ parameters.arch }}-${{ parameters.ep }}' + targetPath: '$(Build.BinariesDirectory)/${{ parameters.cmake_build_type }}' + - ${{ if eq(parameters.is1ES, false) }}: + - task: PublishPipelineArtifact@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + targetPath: '$(Build.BinariesDirectory)/dist' + artifactName: onnxruntime-${{ parameters.arch }}-${{ parameters.ep }} + - task: PublishPipelineArtifact@1 displayName: 'Publish Test Binaries' inputs: artifactName: 'drop-linux-cpu-${{ parameters.arch }}-${{ parameters.ep }}' targetPath: '$(Build.BinariesDirectory)/${{ parameters.cmake_build_type }}' - - template: component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' + + + - template: component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml index 3a3da0f8f5afa..c0bd740b2d483 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml @@ -9,9 +9,13 @@ parameters: - name: machine_pool type: object -- name: python_arch +- name: ep type: string - default: 'x64' + default: 'cpu' + +- name: arch + type: string + default: 'x86_64' jobs: - job: ${{ parameters.job_name }} @@ -37,10 +41,9 @@ jobs: displayName: 'Use Python' inputs: versionSpec: $(PythonVersion) - architecture: ${{ parameters.python_arch }} - download: build # pipeline resource identifier. - artifact: 'onnxruntime' + artifact: 'onnxruntime-${{ parameters.arch }}-${{ parameters.ep }}' - task: Bash@3 inputs: @@ -51,7 +54,7 @@ jobs: FILE_NAME="${files[0]}" FILE_NAME=$(basename $FILE_NAME) PYTHON_PACKAGE_NAME=$(echo "$FILE_NAME" | cut -f 1 -d '-') - python3 -m pip install --find-links "$(Pipeline.Workspace)/build/onnxruntime" $PYTHON_PACKAGE_NAME + python3 -m pip install --find-links "$(Pipeline.Workspace)/build/onnxruntime-${{ parameters.arch }}-${{ parameters.ep }}" $PYTHON_PACKAGE_NAME python3 -m pip show $PYTHON_PACKAGE_NAME python3 -c "import onnxruntime as ort; print(ort.__version__)" workingDirectory: $(Pipeline.Workspace)/build/onnxruntime diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml index c475feaef0018..eef97341b8d53 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml @@ -19,10 +19,10 @@ parameters: type: string default: 'Release' values: - - Debug - - Release - - RelWithDebInfo - - MinSizeRel + - Debug + - Release + - RelWithDebInfo + - MinSizeRel - name: timeout type: number @@ -50,29 +50,31 @@ jobs: artifact: 'drop-linux-cpu-${{ parameters.arch }}-${{parameters.ep}}' - download: current # pipeline resource identifier. 
- artifact: 'onnxruntime${{ parameters.python_wheel_suffix }}-${{ parameters.ep }}' + artifact: 'onnxruntime-${{ parameters.arch }}-${{ parameters.ep }}' - bash: | set -e -x mv "$(Pipeline.Workspace)/drop-linux-cpu-${{ parameters.arch }}-${{parameters.ep}}" $(Build.BinariesDirectory)/${{parameters.cmake_build_type}} - mv "$(Pipeline.Workspace)/onnxruntime${{ parameters.python_wheel_suffix }}-${{parameters.ep}}" "$(Build.BinariesDirectory)/whl" + mv "$(Pipeline.Workspace)/onnxruntime-${{ parameters.arch }}-${{ parameters.ep }}" "$(Build.BinariesDirectory)/whl" cp -r "$(Build.BinariesDirectory)/whl" $(Build.BinariesDirectory)/tmp find "$(Build.BinariesDirectory)/tmp" -name '*.whl' -exec bash -c 'unzip -d "${1%.*}" "$1"' _ {} \; + displayName: 'Move the artifacts to the binaries directory' # The private ADO project - ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}: - download: build # pipeline resource identifier. artifact: 'drop-linux-cpu-${{ parameters.arch }}-${{parameters.ep}}' - download: build # pipeline resource identifier. - artifact: 'onnxruntime${{ parameters.python_wheel_suffix }}-${{ parameters.ep }}' + artifact: 'onnxruntime-${{ parameters.arch }}-${{ parameters.ep }}' - bash: | set -e -x ls $(Pipeline.Workspace)/build mv "$(Pipeline.Workspace)/build/drop-linux-cpu-${{ parameters.arch }}-${{parameters.ep}}" $(Build.BinariesDirectory)/${{parameters.cmake_build_type}} - mv "$(Pipeline.Workspace)/build/onnxruntime${{ parameters.python_wheel_suffix }}-${{parameters.ep}}" "$(Build.BinariesDirectory)/whl" + mv "$(Pipeline.Workspace)/build/onnxruntime-${{ parameters.arch }}-${{ parameters.ep }}" "$(Build.BinariesDirectory)/whl" cp -r "$(Build.BinariesDirectory)/whl" $(Build.BinariesDirectory)/tmp find "$(Build.BinariesDirectory)/tmp" -name '*.whl' -exec bash -c 'unzip -d "${1%.*}" "$1"' _ {} \; + displayName: 'Move the artifacts to the binaries directory' # The BinSkim task uses a dotnet program which doesn't support ARM CPUs yet - ${{ if eq(parameters.arch, 'x86_64') }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index 4c9d0dccaf48d..10ea7f6203bb1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -19,6 +19,11 @@ parameters: type: string default: '' +- name: is1ES + displayName: 'Whether the pipeline is running in 1ES' + type: boolean + default: false + jobs: - job: Win_py_arm64_qnn_Wheels timeoutInMinutes: 210 @@ -26,6 +31,8 @@ jobs: clean: all pool: name: ${{ parameters.MACHINE_POOL }} + os: windows + hostArchitecture: Arm64 strategy: matrix: Python311_arm64: @@ -41,132 +48,140 @@ jobs: GRADLE_OPTS: '-Dorg.gradle.daemon=false' VSGenerator: 'Visual Studio 17 2022' steps: - - checkout: self - clean: true - submodules: recursive - - - template: telemetry-steps.yml - - - script: | - MKDIR $(Agent.ToolsDirectory)\Python\$(PythonVersion)\arm64 - XCOPY /s /y /h /e /c /q "$(LocalPythonDir)\*.*" $(Agent.ToolsDirectory)\Python\$(PythonVersion)\arm64\ - COPY NUL $(Agent.ToolsDirectory)\Python\$(PythonVersion)\arm64.complete - DIR $(Agent.ToolsDirectory)\Python - DIR $(Agent.ToolsDirectory)\Python\$(PythonVersion) - DIR $(Agent.ToolsDirectory)\Python\$(PythonVersion)\arm64 - displayName: Copy python $(PythonVersion) version to agent tools directory - - - task: UsePythonVersion@0 - inputs: - versionSpec: $(PythonVersion) - addToPath: true - 
architecture: 'arm64' - - - task: PipAuthenticate@1 - displayName: 'Pip Authenticate' - inputs: - artifactFeeds: 'Lotus' - - - task: onebranch.pipeline.tsaoptions@1 - displayName: 'OneBranch TSAOptions' - inputs: - tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json' - appendSourceBranchName: false - - - task: PythonScript@0 - inputs: - scriptSource: inline - script: | - import subprocess - subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel']) - workingDirectory: '$(Build.BinariesDirectory)' - displayName: 'Install python modules' - - - template: set-nightly-build-option-variable-step.yml - - - template: jobs/download_win_qnn_sdk.yml - parameters: - QnnSDKVersion: ${{ parameters.QNN_SDK }} - - - task: PythonScript@0 - displayName: 'Generate cmake config' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: > - --config RelWithDebInfo - --build_dir $(Build.BinariesDirectory) - --skip_submodule_sync - --cmake_generator "$(VSGenerator)" - --build_shared_lib - --use_qnn - --qnn_home $(QnnSDKRootDir) - --enable_pybind - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --update - $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} - workingDirectory: '$(Build.BinariesDirectory)' - - - task: VSBuild@1 - displayName: 'Build' - inputs: - solution: '$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln' - platform: 'arm64' - configuration: RelWithDebInfo - msbuildArchitecture: 'arm64' - maximumCpuCount: true - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - createLogFile: true - - # Esrp signing - - template: win-esrp-dll.yml - parameters: - FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime\capi' - DisplayName: 'ESRP - Sign Native dlls' - DoEsrp: true - Pattern: '*.pyd' - - - task: PythonScript@0 - displayName: 'Build wheel' - inputs: - scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel $(NightlyBuildOption) --wheel_name_suffix=qnn' - workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist' - Contents: '*.whl' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel' - inputs: - ArtifactName: onnxruntime_qnn_arm64 - - - script: | - 7z x *.whl - workingDirectory: '$(Build.ArtifactStagingDirectory)' - displayName: 'unzip the package' - - - task: CredScan@3 - displayName: 'Run CredScan' - inputs: - debugMode: false - continueOnError: true - - - task: BinSkim@4 - displayName: 'Run BinSkim' - inputs: - AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll' - - - task: TSAUpload@2 - displayName: 'TSA upload' - condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) - inputs: - GdnPublishTsaOnboard: false - GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - - - template: component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' + - checkout: self + clean: true + submodules: recursive + + - template: telemetry-steps.yml + + - script: | + MKDIR $(Agent.ToolsDirectory)\Python\$(PythonVersion)\arm64 + XCOPY /s /y /h /e /c /q "$(LocalPythonDir)\*.*" $(Agent.ToolsDirectory)\Python\$(PythonVersion)\arm64\ + COPY NUL 
$(Agent.ToolsDirectory)\Python\$(PythonVersion)\arm64.complete
+      DIR $(Agent.ToolsDirectory)\Python
+      DIR $(Agent.ToolsDirectory)\Python\$(PythonVersion)
+      DIR $(Agent.ToolsDirectory)\Python\$(PythonVersion)\arm64
+    displayName: Copy python $(PythonVersion) version to agent tools directory
+
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: $(PythonVersion)
+      addToPath: true
+      architecture: 'arm64'
+
+  - task: PipAuthenticate@1
+    displayName: 'Pip Authenticate'
+    inputs:
+      artifactFeeds: 'Lotus'
+
+  - task: onebranch.pipeline.tsaoptions@1
+    displayName: 'OneBranch TSAOptions'
+    inputs:
+      tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json'
+      appendSourceBranchName: false
+
+  - task: PythonScript@0
+    inputs:
+      scriptSource: inline
+      script: |
+        import subprocess
+        subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel'])
+      workingDirectory: '$(Build.BinariesDirectory)'
+    displayName: 'Install python modules'
+
+  - template: set-nightly-build-option-variable-step.yml
+
+  - template: jobs/download_win_qnn_sdk.yml
+    parameters:
+      QnnSDKVersion: ${{ parameters.QNN_SDK }}
+
+  - task: PythonScript@0
+    displayName: 'Generate cmake config'
+    inputs:
+      scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
+      arguments: >
+        --config RelWithDebInfo
+        --build_dir $(Build.BinariesDirectory)
+        --skip_submodule_sync
+        --cmake_generator "$(VSGenerator)"
+        --build_shared_lib
+        --use_qnn
+        --qnn_home $(QnnSDKRootDir)
+        --enable_pybind
+        --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --update
+        $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }}
+      workingDirectory: '$(Build.BinariesDirectory)'
+
+  - task: VSBuild@1
+    displayName: 'Build'
+    inputs:
+      solution: '$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln'
+      platform: 'arm64'
+      configuration: RelWithDebInfo
+      msbuildArchitecture: 'arm64'
+      maximumCpuCount: true
+      logProjectEvents: true
+      workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo'
+      createLogFile: true
+
+  # Esrp signing
+  - template: win-esrp-dll.yml
+    parameters:
+      FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime\capi'
+      DisplayName: 'ESRP - Sign Native dlls'
+      DoEsrp: true
+      Pattern: '*.pyd'
+
+  - task: PythonScript@0
+    displayName: 'Build wheel'
+    inputs:
+      scriptPath: '$(Build.SourcesDirectory)\setup.py'
+      arguments: 'bdist_wheel $(NightlyBuildOption) --wheel_name_suffix=qnn'
+      workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo'
+
+  - task: CopyFiles@2
+    displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
+    inputs:
+      SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist'
+      Contents: '*.whl'
+      TargetFolder: '$(Build.ArtifactStagingDirectory)'
+
+  - ${{ if eq(parameters.is1ES, true) }}:
+    - task: 1ES.PublishPipelineArtifact@1
+      displayName: 'Publish Artifact: ONNXRuntime python wheel'
+      inputs:
+        artifactName: onnxruntime_qnn_arm64_$(PythonVersion)
+        targetPath: '$(Build.ArtifactStagingDirectory)'
+  - ${{ if eq(parameters.is1ES, false) }}:
+    - task: PublishPipelineArtifact@1
+      displayName: 'Publish Artifact: ONNXRuntime python wheel'
+      inputs:
+        artifactName: onnxruntime_qnn_arm64_$(PythonVersion)
+        targetPath: '$(Build.ArtifactStagingDirectory)'
+
+  - script: |
+      7z x *.whl
+    workingDirectory: '$(Build.ArtifactStagingDirectory)'
+    displayName: 'unzip the package'
+
+  - task: CredScan@3
+    displayName: 'Run CredScan'
+    inputs:
+      debugMode: false
+      continueOnError: true
+
+  - task: BinSkim@4
+    displayName: 'Run BinSkim'
+    inputs:
+ AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll' + + - task: TSAUpload@2 + displayName: 'TSA upload' + condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + inputs: + GdnPublishTsaOnboard: false + GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' + + - template: component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml index ed29f1e67515e..24321d2a3e1ec 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml @@ -19,6 +19,11 @@ parameters: type: string default: '' +- name: is1ES + displayName: 'Whether the pipeline is running in 1ES' + type: boolean + default: false + jobs: - job: Win_py_x64_qnn_Wheels timeoutInMinutes: 210 @@ -26,6 +31,7 @@ jobs: clean: all pool: name: ${{ parameters.MACHINE_POOL }} + os: windows strategy: matrix: Python310_x64: @@ -40,117 +46,124 @@ jobs: GRADLE_OPTS: '-Dorg.gradle.daemon=false' VSGenerator: 'Visual Studio 17 2022' steps: - - checkout: self - clean: true - submodules: recursive - - - template: telemetry-steps.yml - - - task: UsePythonVersion@0 - inputs: - versionSpec: $(PythonVersion) - addToPath: true - architecture: 'x64' - - - task: PipAuthenticate@1 - displayName: 'Pip Authenticate' - inputs: - artifactFeeds: 'Lotus' - - - task: onebranch.pipeline.tsaoptions@1 - displayName: 'OneBranch TSAOptions' - inputs: - tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json' - appendSourceBranchName: fals - - - script: python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\linux\python\requirements.txt - - - - template: set-nightly-build-option-variable-step.yml - - - template: jobs/download_win_qnn_sdk.yml - parameters: - QnnSDKVersion: ${{ parameters.QNN_SDK }} - - - task: PythonScript@0 - displayName: 'Generate cmake config' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: > - --config RelWithDebInfo - --build_dir $(Build.BinariesDirectory) - --skip_submodule_sync - --cmake_generator "$(VSGenerator)" - --build_shared_lib - --use_qnn - --qnn_home $(QnnSDKRootDir) - --enable_pybind - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --update --arm64ec - $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} - workingDirectory: '$(Build.BinariesDirectory)' - - - task: VSBuild@1 - displayName: 'Build' - inputs: - solution: '$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln' - platform: 'arm64ec' - configuration: RelWithDebInfo - msbuildArchitecture: 'x64' - maximumCpuCount: true - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - createLogFile: true - - # Esrp signing - - template: win-esrp-dll.yml - parameters: - FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime\capi' - DisplayName: 'ESRP - Sign Native dlls' - DoEsrp: true - Pattern: '*.pyd' - - - task: PythonScript@0 - displayName: 'Build wheel' - inputs: - scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel $(NightlyBuildOption) --wheel_name_suffix=qnn' - workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: 
'$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist'
-      Contents: '*.whl'
-      TargetFolder: '$(Build.ArtifactStagingDirectory)'
-
-  - task: PublishBuildArtifacts@1
-    displayName: 'Publish Artifact: ONNXRuntime python wheel'
-    inputs:
-      ArtifactName: onnxruntime_qnn_arm64ec
-
-  - script: |
-      7z x *.whl
-    workingDirectory: '$(Build.ArtifactStagingDirectory)'
-    displayName: 'unzip the package'
-
-  - task: CredScan@3
-    displayName: 'Run CredScan'
-    inputs:
-      debugMode: false
-      continueOnError: true
-
-  - task: BinSkim@4
-    displayName: 'Run BinSkim'
-    inputs:
-      AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll'
-
-  - task: TSAUpload@2
-    displayName: 'TSA upload'
-    condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'))
-    inputs:
-      GdnPublishTsaOnboard: false
-      GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa'
-
-  - template: component-governance-component-detection-steps.yml
-    parameters:
-      condition: 'succeeded'
+  - checkout: self
+    clean: true
+    submodules: recursive
+
+  - template: telemetry-steps.yml
+
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: $(PythonVersion)
+      addToPath: true
+      architecture: 'x64'
+
+  - task: PipAuthenticate@1
+    displayName: 'Pip Authenticate'
+    inputs:
+      artifactFeeds: 'Lotus'
+
+  - task: onebranch.pipeline.tsaoptions@1
+    displayName: 'OneBranch TSAOptions'
+    inputs:
+      tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json'
+      appendSourceBranchName: false
+
+  - script: python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\linux\python\requirements.txt
+
+
+  - template: set-nightly-build-option-variable-step.yml
+
+  - template: jobs/download_win_qnn_sdk.yml
+    parameters:
+      QnnSDKVersion: ${{ parameters.QNN_SDK }}
+
+  - task: PythonScript@0
+    displayName: 'Generate cmake config'
+    inputs:
+      scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
+      arguments: >
+        --config RelWithDebInfo
+        --build_dir $(Build.BinariesDirectory)
+        --skip_submodule_sync
+        --cmake_generator "$(VSGenerator)"
+        --build_shared_lib
+        --use_qnn
+        --qnn_home $(QnnSDKRootDir)
+        --enable_pybind
+        --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --update --arm64ec
+        $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }}
+      workingDirectory: '$(Build.BinariesDirectory)'
+
+  - task: VSBuild@1
+    displayName: 'Build'
+    inputs:
+      solution: '$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln'
+      platform: 'arm64ec'
+      configuration: RelWithDebInfo
+      msbuildArchitecture: 'x64'
+      maximumCpuCount: true
+      logProjectEvents: true
+      workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo'
+      createLogFile: true
+
+  # Esrp signing
+  - template: win-esrp-dll.yml
+    parameters:
+      FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime\capi'
+      DisplayName: 'ESRP - Sign Native dlls'
+      DoEsrp: true
+      Pattern: '*.pyd'
+
+  - task: PythonScript@0
+    displayName: 'Build wheel'
+    inputs:
+      scriptPath: '$(Build.SourcesDirectory)\setup.py'
+      arguments: 'bdist_wheel $(NightlyBuildOption) --wheel_name_suffix=qnn'
+      workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo'
+
+  - task: CopyFiles@2
+    displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)'
+    inputs:
+      SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist'
+      Contents: '*.whl'
+      TargetFolder: '$(Build.ArtifactStagingDirectory)'
+
+  - ${{ if eq(parameters.is1ES, true) }}:
+    - task: 1ES.PublishPipelineArtifact@1
+      displayName: 'Publish Artifact: 
ONNXRuntime python wheel' + inputs: + artifactName: onnxruntime_qnn_arm64ec_$(PythonVersion) + targetPath: '$(Build.ArtifactStagingDirectory)' + - ${{ if eq(parameters.is1ES, false) }}: + - task: PublishPipelineArtifact@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + artifactName: onnxruntime_qnn_arm64ec_$(PythonVersion) + targetPath: '$(Build.ArtifactStagingDirectory)' + - script: | + 7z x *.whl + workingDirectory: '$(Build.ArtifactStagingDirectory)' + displayName: 'unzip the package' + + - task: CredScan@3 + displayName: 'Run CredScan' + inputs: + debugMode: false + continueOnError: true + + - task: BinSkim@4 + displayName: 'Run BinSkim' + inputs: + AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll' + + - task: TSAUpload@2 + displayName: 'TSA upload' + condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + inputs: + GdnPublishTsaOnboard: false + GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' + + - template: component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 13069846da342..175b343e55d57 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -19,6 +19,11 @@ parameters: type: string default: '' +- name: is1ES + displayName: 'Whether the pipeline is running in 1ES' + type: boolean + default: false + jobs: - job: Win_py_x64_qnn_Wheels timeoutInMinutes: 210 @@ -116,10 +121,18 @@ jobs: Contents: '*.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel' - inputs: - ArtifactName: onnxruntime_qnn_x64 + - ${{ if eq(parameters.is1ES, true) }}: + - task: 1ES.PublishPipelineArtifact@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + artifactName: onnxruntime_qnn_x64_$(PythonVersion) + targetPath: '$(Build.ArtifactStagingDirectory)' + - ${{ if eq(parameters.is1ES, false) }}: + - task: PublishPipelineArtifact@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + artifactName: onnxruntime_qnn_x64_$(PythonVersion) + targetPath: '$(Build.ArtifactStagingDirectory)' - script: | 7z x *.whl diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index 7991916a47ca4..52dbb76632e0c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -62,10 +62,14 @@ stages: dependsOn: '${{parameters.InitialStageDependsOn}}' jobs: - job: ReactNative_CI_iOS - pool: - name: 'Azure Pipelines' - image: 'macOS-13' - os: 'macOS' + ${{ if eq(parameters.is1ES, false) }}: + pool: + vmImage: 'macOS-13' + ${{ if eq(parameters.is1ES, true) }}: + pool: + name: 'Azure Pipelines' + image: 'macOS-13' + os: 'macOS' timeoutInMinutes: 120 From 989d4177ed99db324ba4a4a35149977626120b14 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 7 Mar 2025 23:35:26 -0500 Subject: [PATCH 45/46] Delete ROCM Nuget Publishing Pipeline (#23948) --- .../rocm-nuget-packaging-pipeline.yml | 339 ------------------ .../rocm-publish-nuget-pipeline.yml | 21 -- 2 files changed, 360 deletions(-) delete mode 100644 
tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml delete mode 100644 tools/ci_build/github/azure-pipelines/rocm-publish-nuget-pipeline.yml diff --git a/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml deleted file mode 100644 index 286f92b36f7e4..0000000000000 --- a/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml +++ /dev/null @@ -1,339 +0,0 @@ -parameters: -- name: RunOnnxRuntimeTests - displayName: Run Tests? - type: boolean - default: true - -- name: UseIncreasedTimeoutForTests - displayName: Increase timeout for tests? Set it to false if you are doing an Onnx Runtime release. - type: boolean - default: false - -- name: DoCompliance - displayName: Run Compliance Tasks? - type: boolean - default: true - -- name: DoEsrp - displayName: Run code sign tasks? Must be true if you are doing an ONNX Runtime release - type: boolean - default: true - -- name: IsReleaseBuild - displayName: Is a release build? Set it to true if you are doing an ONNX Runtime release. - type: boolean - default: false - -- name: PreReleaseVersionSuffixString - displayName: Suffix added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the type of pre-release package. - type: string - values: - - alpha - - beta - - rc - - none - default: none - -- name: PreReleaseVersionSuffixNumber - displayName: Number added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the sequence of a pre-release package. - type: number - default: 0 - -# these 2 parameters are used for debugging. -- name: SpecificArtifact - displayName: Use Specific Artifact (Debugging only) - type: boolean - default: false - -- name: BuildId - displayName: Pipeline BuildId, you could find it in the URL - type: string - default: '0' - -- name: NugetPackageSuffix - displayName: Suffix to append to nuget package - type: string - default: 'NONE' - -resources: - repositories: - - repository: onnxruntime-inference-examples # The name used to reference this repository in the checkout step - type: github - endpoint: ort-examples - name: microsoft/onnxruntime-inference-examples - - repository: manylinux - type: Github - endpoint: Microsoft - name: pypa/manylinux - ref: 5eda9aded5462201e6310105728d33016e637ea7 - -variables: -- name: ReleaseVersionSuffix - value: '' - -stages: -- template: stages/set_packaging_variables_stage.yml - parameters: - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} - PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} - -# ROCm -- stage: Linux_C_API_Packaging_ROCm_x64 - dependsOn: [] - jobs: - - job: Linux_C_API_Packaging_ROCm_x64 - workspace: - clean: all - timeoutInMinutes: 480 - pool: onnxruntime-Ubuntu2204-AMD-CPU - variables: - RocmVersion: '6.2' - RocmVersionPatchSuffix: '' - steps: - - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime - submodules: recursive - - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux, for get-docker-image-steps.yml - submodules: false - - # get-docker-image-steps.yml will move the $(Build.SourcesDirectory)/manylinux into $(Build.SourcesDirectory)/onnxruntime, - # then rename $(Build.SourcesDirectory)/onnxruntime as $(Build.SourcesDirectory) - - template: templates/get-docker-image-steps.yml 
- parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: >- - --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur - --build-arg BUILD_UID=$(id -u) - --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 - --build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix) - --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root - --build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin: - --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib - Repository: onnxruntimetrainingrocmbuild-rocm$(RocmVersion) - CheckOutManyLinux: true - - - template: templates/set-version-number-variables-step.yml - - - task: Bash@3 - displayName: 'Build' - inputs: - targetType: filePath - filePath: tools/ci_build/github/linux/build_rocm_c_api_package.sh - arguments: >- - -S $(Build.SourcesDirectory) - -B $(Build.BinariesDirectory) - -V $(RocmVersion) - -I onnxruntimetrainingrocmbuild-rocm$(RocmVersion) - -P python3.10 - - - script: | - set -e -x - mkdir $(Build.ArtifactStagingDirectory)/testdata - cp $(Build.BinariesDirectory)/Release/libcustom_op_library.so* $(Build.ArtifactStagingDirectory)/testdata - ls -al $(Build.ArtifactStagingDirectory) - displayName: 'Create Artifacts for CustomOp' # libcustom_op_library.so from cpu build is built with fp8, ROCm does not support it. - - - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml - parameters: - buildConfig: 'Release' - artifactName: 'onnxruntime-linux-x64-rocm-$(OnnxRuntimeVersion)' - artifactNameNoVersionString: 'onnxruntime-linux-x64-rocm' - libraryName: 'libonnxruntime.so.$(OnnxRuntimeVersion)' - - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - template: templates/clean-agent-build-directory-step.yml - -- stage: NuGet_Packaging_ROCm - dependsOn: - - Setup - - Linux_C_API_Packaging_ROCm_x64 - condition: succeeded() - jobs: - - job: NuGet_Packaging_ROCm - workspace: - clean: all - # we need to use a 2022 pool to create the nuget package with MAUI targets. 
- # VS2019 has no support for net6/MAUI and we need to use msbuild (from the VS install) to do the packing - pool: 'Onnxruntime-Win-CPU-2022' - variables: - breakCodesignValidationInjection: ${{ parameters.DoEsrp }} - ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] - BuildDate : $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Date.BuildDate']] - BuildTime : $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Time.BuildTime']] - - steps: - - checkout: self - submodules: true - fetchDepth: 1 - - - template: templates/flex-downloadPipelineArtifact.yml - parameters: - StepName: 'Download Pipeline Artifact - NuGet' - ArtifactName: 'onnxruntime-linux-x64-rocm' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' - SpecificArtifact: ${{ parameters.specificArtifact }} - BuildId: ${{ parameters.BuildId }} - - - task: PowerShell@2 - displayName: 'Reconstruct Build Directory' - inputs: - targetType: inline - script: | - Get-ChildItem $(Build.BinariesDirectory)\nuget-artifact -Filter *.tgz | % { - # *.tar will be created after *.tgz is extracted - $cmd = "7z.exe x $($_.FullName) -y -o$(Build.BinariesDirectory)\nuget-artifact" - Write-Output $cmd - Invoke-Expression -Command $cmd - } - - Get-ChildItem $(Build.BinariesDirectory)\nuget-artifact -Filter *.tar | % { - $cmd = "7z.exe x $($_.FullName) -y -o$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" - Write-Output $cmd - Invoke-Expression -Command $cmd - } - - $ort_dirs = Get-ChildItem -Path $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-* -Directory - foreach ($ort_dir in $ort_dirs) - { - $dirname = Split-Path -Path $ort_dir -Leaf - $dirname = $dirname.SubString(0, $dirname.LastIndexOf('-')) - Write-Output "Renaming $ort_dir to $dirname" - Rename-Item -Path $ort_dir -NewName $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\$dirname - } - - Copy-Item -Path $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-linux-x64-rocm\lib\* -Destination $(Build.BinariesDirectory)\RelWithDebInfo - - - script: | - tree /F - workingDirectory: '$(Build.BinariesDirectory)' - displayName: 'Inspect Build Binaries Directory' - - - script: | - mklink /D /J models C:\local\models - workingDirectory: '$(Build.BinariesDirectory)' - displayName: 'Create models link' - - - task: NuGetToolInstaller@0 - displayName: Use Nuget 6.10.x - inputs: - versionSpec: 6.10.x - - - task: MSBuild@1 - displayName: 'Restore NuGet Packages and create project.assets.json' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm"' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: MSBuild@1 - displayName: 'Build C# bindings' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: > - -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" - -p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm" - -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} - -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) - -p:IsLinuxBuild=true - -p:IsWindowsBuild=false - -p:IsMacOSBuild=false - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - template: templates/win-esrp-dll.yml - parameters: - FolderPath: 
'$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' - DisplayName: 'ESRP - Sign C# dlls' - DoEsrp: ${{ parameters.DoEsrp }} - - - task: UsePythonVersion@0 - displayName: 'Use Python' - inputs: - versionSpec: 3.12 - - - task: MSBuild@1 - displayName: 'Build Nuget Packages' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj' - configuration: RelWithDebInfo - platform: 'Any CPU' - msbuildArguments: > - -t:CreatePackage - -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" - -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm - -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} - -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) - -p:CurrentTime=$(BuildTime) - -p:CurrentDate=$(BuildDate) - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: CopyFiles@2 - displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - Contents: '*.snupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: CopyFiles@2 - displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - Contents: '*.nupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: CopyFiles@2 - displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' - Contents: '*.nupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - template: templates/esrp_nuget.yml - parameters: - DisplayName: 'ESRP - sign NuGet package' - FolderPath: '$(Build.ArtifactStagingDirectory)' - DoEsrp: ${{ parameters.DoEsrp }} - - - template: templates/validate-package.yml - parameters: - PackageType: 'nuget' - PackagePath: '$(Build.ArtifactStagingDirectory)' - PackageName: 'Microsoft.ML.OnnxRuntime.*nupkg' - PlatformsSupported: 'linux-x64' - VerifyNugetSigning: false - - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline NuGet Artifact' - inputs: - artifactName: 'drop-signed-nuget-ROCm' - targetPath: '$(Build.ArtifactStagingDirectory)' - - - task: MSBuild@1 - displayName: 'Clean C#' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:Clean -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - template: templates/component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' - - -- template: nuget/templates/test_linux.yml - parameters: - AgentPool: AMD-GPU - ArtifactSuffix: 'ROCm' - StageSuffix: 'ROCm' - NugetPackageName: 'Microsoft.ML.OnnxRuntime.ROCm' - SpecificArtifact: ${{ parameters.specificArtifact }} - CustomOpArtifactName: 'onnxruntime-linux-x64-rocm' - BuildId: ${{ parameters.BuildId }} diff --git a/tools/ci_build/github/azure-pipelines/rocm-publish-nuget-pipeline.yml b/tools/ci_build/github/azure-pipelines/rocm-publish-nuget-pipeline.yml deleted file mode 100644 index 1d2393d8f96d5..0000000000000 --- a/tools/ci_build/github/azure-pipelines/rocm-publish-nuget-pipeline.yml +++ /dev/null @@ -1,21 +0,0 @@ -resources: - pipelines: - - pipeline: build - source: 'Nuget ROCM Packaging pipeline' - trigger: - branches: - include: - - main - - rel-* - branch: main - -# ROCm -stages: -- 
template: templates/publish-nuget-steps.yml - parameters: - stage_name: 'Publish_ROCM_NuGet_Package' - download_artifacts_steps: - - download: build - displayName: 'Download Pipeline Artifact - Signed NuGet Package' - artifact: 'drop-signed-nuget-ROCm' - - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-ROCm\*" $(Build.BinariesDirectory)\nuget-artifact\final-package From fe7634eb6f20b656a3df978a6a2ef9b3ea00c59d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 9 Mar 2025 19:07:47 -0700 Subject: [PATCH 46/46] Bump SixLabors.ImageSharp from 2.1.9 to 2.1.10 in /csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample (#23924) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [SixLabors.ImageSharp](https://github.com/SixLabors/ImageSharp) from 2.1.9 to 2.1.10.
Release notes (sourced from SixLabors.ImageSharp's releases):

v2.1.10 - What's Changed

Full Changelog: https://github.com/SixLabors/ImageSharp/compare/v2.1.9...v2.1.10
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=SixLabors.ImageSharp&package-manager=nuget&previous-version=2.1.9&new-version=2.1.10)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/microsoft/onnxruntime/network/alerts).
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .../Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj b/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj
index f00a08a1a3595..b1452a64934c2 100644
--- a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj
+++ b/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/Microsoft.ML.OnnxRuntime.FasterRcnnSample.csproj
@@ -8,7 +8,7 @@
 
-    <PackageReference Include="SixLabors.ImageSharp" Version="2.1.9" />
+    <PackageReference Include="SixLabors.ImageSharp" Version="2.1.10" />
 
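
A note on the recurring pattern in the pipeline patches above: each packaging template gains an `is1ES` boolean parameter and replaces `PublishBuildArtifacts@1` with a compile-time choice between the 1ES publish task and the standard one. The sketch below is a minimal illustration of that pattern, assuming only the `is1ES` parameter shown in the diffs; the artifact name `my-artifact` is a hypothetical placeholder, not a value taken from any one template.

# Minimal sketch: compile-time selection between the 1ES and standard publish tasks.
# `is1ES` mirrors the parameter added in the templates above; `my-artifact` is a
# hypothetical placeholder name.
parameters:
- name: is1ES
  displayName: 'Whether the pipeline is running in 1ES'
  type: boolean
  default: false

steps:
# On 1ES-hosted pipelines, publish through the 1ES wrapper task.
- ${{ if eq(parameters.is1ES, true) }}:
  - task: 1ES.PublishPipelineArtifact@1
    displayName: 'Publish Artifact'
    inputs:
      artifactName: my-artifact
      targetPath: '$(Build.ArtifactStagingDirectory)'
# Everywhere else, use the standard task with identical inputs.
- ${{ if eq(parameters.is1ES, false) }}:
  - task: PublishPipelineArtifact@1
    displayName: 'Publish Artifact'
    inputs:
      artifactName: my-artifact
      targetPath: '$(Build.ArtifactStagingDirectory)'

Because `${{ if }}` expressions are resolved at template expansion time, only one of the two tasks appears in the expanded job. Keeping `artifactName` and `targetPath` identical in both branches lets downstream `download` steps stay agnostic to which task published the artifact.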