From 25c43c1fd66de00d3e7ebe58ecd4b37e060e5be5 Mon Sep 17 00:00:00 2001 From: David Fan <30608893+jiafatom@users.noreply.github.com> Date: Fri, 2 May 2025 21:38:54 -0700 Subject: [PATCH 01/84] K quant (#24615) ### Description Integrate some neural compressor code since the ORT side in the repo is in maintenance mode. ### Motivation and Context Enable k-quant quantization. --- cmake/onnxruntime_python.cmake | 7 + .../quantization/matmul_nbits_quantizer.py | 49 +- .../neural_compressor/__init__.py | 1 + .../neural_compressor/onnx_model.py | 1264 +++++++++++++++++ .../quantization/neural_compressor/util.py | 80 ++ .../neural_compressor/weight_only.py | 932 ++++++++++++ .../quantization/test_op_matmul_4bits.py | 4 + setup.py | 1 + 8 files changed, 2330 insertions(+), 8 deletions(-) create mode 100644 onnxruntime/python/tools/quantization/neural_compressor/__init__.py create mode 100644 onnxruntime/python/tools/quantization/neural_compressor/onnx_model.py create mode 100644 onnxruntime/python/tools/quantization/neural_compressor/util.py create mode 100644 onnxruntime/python/tools/quantization/neural_compressor/weight_only.py diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index c57a2a962303d..8f7a96e052fa1 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -468,6 +468,9 @@ file(GLOB onnxruntime_python_quantization_fusions_src CONFIGURE_DEPENDS file(GLOB onnxruntime_python_quantization_ep_qnn_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/quantization/execution_providers/qnn/*.py" ) +file(GLOB onnxruntime_python_quantization_neural_compressor_src CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/python/tools/quantization/neural_compressor/*.py" +) file(GLOB onnxruntime_python_transformers_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/*.py" ) @@ -581,6 +584,7 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization/fusions COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization/execution_providers COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization/execution_providers/qnn + COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization/neural_compressor COMMAND ${CMAKE_COMMAND} -E make_directory $/quantization COMMAND ${CMAKE_COMMAND} -E make_directory $/transformers COMMAND ${CMAKE_COMMAND} -E make_directory $/transformers/test_data/models @@ -660,6 +664,9 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_quantization_ep_qnn_src} $/onnxruntime/quantization/execution_providers/qnn/ + COMMAND ${CMAKE_COMMAND} -E copy + ${onnxruntime_python_quantization_neural_compressor_src} + $/onnxruntime/quantization/neural_compressor/ COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_src} $/onnxruntime/transformers/ diff --git a/onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py index b1d58b713eea8..ef08b56cfe7ad 100644 --- a/onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py @@ -21,6 +21,7 @@ from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_matmul_8bits, quantize_qdq_matmul_4bits from .calibrate import CalibrationDataReader +from .neural_compressor import gptq_quantize, rtn_quantize from .onnx_model import ONNXModel from .quant_utils import QuantFormat, attribute_to_kwarg @@ -98,6 +99,40 @@ def __init__( self.ratios = ratios 
+class KQuantWeightOnlyQuantConfig(WeightOnlyQuantConfig):
+    def __init__(
+        self,
+        ratios=None,
+        quant_format=QuantFormat.QOperator,
+        op_types_to_quantize: tuple[str, ...] | None = None,
+        customized_weight_config: dict | None = None,
+    ):
+        """
+        This is a class for k-quant algorithm Weight Only Quant Configuration.
+
+        Args:
+            ratios:
+                percentile of clip. Defaults to {}.
+            quant_format (QuantFormat{QOperator, QDQ}, optional):
+                QOperator format quantizes the model with quantized operators directly.
+                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
+                Defaults to QuantFormat.QOperator.
+            op_types_to_quantize (optional):
+                set of operator types to quantize.
+        """
+        assert quant_format == QuantFormat.QOperator, "k-quant only supports QOperator format"
+
+        if ratios is None:
+            ratios = {}
+        super().__init__(
+            algorithm="k_quant",
+            quant_format=quant_format,
+            op_types_to_quantize=op_types_to_quantize,
+            customized_weight_config=customized_weight_config,
+        )
+        self.ratios = ratios
+
+
 class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
     def __init__(
         self,
@@ -1258,14 +1293,12 @@ def inc_dataloader():
         algorithm = self.algo_config.algorithm
         logger.info(f"start to quantize model with {algorithm} algorithm...")
 
-        if algorithm == "RTN":
-            from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize
-
+        if algorithm in ["RTN", "k_quant"]:
             kwargs["ratios"] = self.algo_config.ratios
+            kwargs["algorithm"] = algorithm
 
             """
-            neural-compressor uses fp32 to represent the node that skip quantization, it does not mean this node is fp32 type though.
-            https://github.com/intel/neural-compressor/blob/a617115b1490bbe6163c0024fb55bd260c8914df/neural_compressor/adaptor/ox_utils/weight_only.py#L343
+            We use fp32 to represent nodes that skip quantization; it does not mean these nodes are fp32 type though.
""" for n in self.nodes_to_exclude: weight_only_node_config[n] = "fp32" @@ -1276,8 +1309,6 @@ def inc_dataloader(): **kwargs, ) elif algorithm == "GPTQ": - from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize - kwargs["percdamp"] = self.algo_config.percdamp kwargs["blocksize"] = self.algo_config.block_size kwargs["actorder"] = self.algo_config.actorder @@ -1370,7 +1401,7 @@ def parse_args(): "--quant_method", default="default", type=str, - choices=["default", "hqq", "rtn", "gptq", "nvidia_awq"], + choices=["default", "hqq", "rtn", "k_quant", "gptq", "nvidia_awq"], help="the algorithm used to quantize weight, \nrtn and gptq leverage IntelĀ® Neural Compressor", ) parser.add_argument("--bits", default=4, type=int, help="the target bits to represent weight") @@ -1500,6 +1531,8 @@ def parse_args(): ) elif args.quant_method == "rtn": quant_config = RTNWeightOnlyQuantConfig(op_types_to_quantize=op_types_to_quantize) + elif args.quant_method == "k_quant": + quant_config = KQuantWeightOnlyQuantConfig(op_types_to_quantize=op_types_to_quantize) elif args.quant_method == "gptq": quant_config = GPTQWeightOnlyQuantConfig(block_size=args.block_size, op_types_to_quantize=op_types_to_quantize) elif args.quant_method == "nvidia_awq": diff --git a/onnxruntime/python/tools/quantization/neural_compressor/__init__.py b/onnxruntime/python/tools/quantization/neural_compressor/__init__.py new file mode 100644 index 0000000000000..08b9a38624c98 --- /dev/null +++ b/onnxruntime/python/tools/quantization/neural_compressor/__init__.py @@ -0,0 +1 @@ +from .weight_only import gptq_quantize, rtn_quantize # noqa: F401 diff --git a/onnxruntime/python/tools/quantization/neural_compressor/onnx_model.py b/onnxruntime/python/tools/quantization/neural_compressor/onnx_model.py new file mode 100644 index 0000000000000..f931045c4e349 --- /dev/null +++ b/onnxruntime/python/tools/quantization/neural_compressor/onnx_model.py @@ -0,0 +1,1264 @@ +# +# The implementation of this file is based on: +# https://github.com/intel/neural-compressor/tree/master/neural_compressor +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Class for ONNX model.""" + +import logging +import os +import sys +from pathlib import Path + +import onnx + +from .util import MAXIMUM_PROTOBUF, find_by_name + +logger = logging.getLogger("neural_compressor") + +# TODO: Check https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/onnx_model.py to see if we can integrate with it. + + +class ONNXModel: + """Build ONNX model.""" + + def __init__(self, model, **kwargs): + """Initialize an ONNX model. + + Args: + model (str or ModelProto): path to onnx model or loaded ModelProto model object. + ignore_warning (bool): ignore large model warning. Default is False. + load_external_data (bool): load external data for large model. Default is True. 
+ """ + self._model = model if not isinstance(model, str) else onnx.load(model, load_external_data=False) + self._model_path = None if not isinstance(model, str) else model + + self.check_is_large_model() + if self._is_large_model and self._model_path is None and not kwargs.get("ignore_warning", False): + logger.warning("Model size > 2GB. Please use model path instead of onnx model object to quantize") + + if self._is_large_model and isinstance(model, str) and kwargs.get("load_external_data", True): + from onnx.external_data_helper import load_external_data_for_model + + load_external_data_for_model(self._model, os.path.dirname(self._model_path)) + + self._config = None + if isinstance(model, str) and os.path.exists(Path(model).parent.joinpath("config.json").as_posix()): + from transformers import AutoConfig + + self._config = AutoConfig.from_pretrained(Path(model).parent.as_posix()) + + self.node_name_counter = {} + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self._model.graph.node) + self._get_output_name_to_node(self._model.graph.node) + self._graph_info = {} + self._get_graph_info() + self._q_config = None + + def check_is_large_model(self): + """Check model > 2GB.""" + init_size = 0 + for init in self._model.graph.initializer: + # if initializer has external data location, return True + if init.HasField("data_location") and init.data_location == onnx.TensorProto.EXTERNAL: + self._is_large_model = True + return + # if raise error of initializer size > 2GB, return True + try: + init_bytes = init.SerializeToString() + init_size += sys.getsizeof(init_bytes) + except Exception as e: + if "exceeds maximum protobuf size of 2GB" in str(e): + self._is_large_model = True + return + else: # pragma: no cover + raise e + if init_size > MAXIMUM_PROTOBUF: + self._is_large_model = True + return + self._is_large_model = False + + @property + def is_large_model(self): + """Check the onnx model is over 2GB.""" + return self._is_large_model + + @property + def model_path(self): + """Return model path.""" + return self._model_path + + @model_path.setter + def model_path(self, path): + """Set model path.""" + self._model_path = path + + def framework(self): + """Return framework.""" + return "onnxruntime" + + @property + def q_config(self): + """Return q_config.""" + return self._q_config + + @q_config.setter + def q_config(self, q_config): + """Set q_config.""" + self._q_config = q_config + + @property + def hf_config(self): + """Return huggingface config if model is Transformer-based.""" + return self._config + + @property + def model(self): + """Return model itself.""" + return self._model + + @model.setter + def model(self, model): + """Set model itself.""" + self._model = model + self._graph_info = {} + self._get_graph_info() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self._model.graph.node) + self._get_output_name_to_node(self._model.graph.node) + + def input(self): + """Return input of model.""" + return [i.name for i in self._model.graph.input] + + def output(self): + """Return output of model.""" + return [i.name for i in self._model.graph.output] + + def update(self): + """Update model info.""" + self._graph_info = {} + self._get_graph_info() + self._output_name_to_node = {} + self._input_name_to_nodes = {} + self._get_input_name_to_nodes(self._model.graph.node) + self._get_output_name_to_node(self._model.graph.node) + + @property + def graph_info(self): + """Return ORT Graph Info object 
holding information about backend graph.""" + return self._graph_info + + def _get_graph_info(self): + """Update graph info.""" + for node in self._model.graph.node: + self.graph_info.update({node.name: node.op_type}) + + def save(self, root): + """Save ONNX model.""" + if os.path.split(root)[0] != "" and not os.path.exists(os.path.split(root)[0]): + raise ValueError('"root" directory does not exists.') + if self.is_large_model: # pragma: no cover + from onnx.external_data_helper import load_external_data_for_model + + load_external_data_for_model(self._model, os.path.split(self._model_path)[0]) + onnx.save_model( + self._model, + root, + save_as_external_data=True, + all_tensors_to_one_file=True, + location=root.split("/")[-1] + "_data", + size_threshold=1024, + convert_attribute=False, + ) + else: + onnx.save(self._model, root) + + if self._config is not None: + model_type = "" if not hasattr(self._config, "model_type") else self._config.model_type + self._config.__class__.model_type = model_type + output_config_file = Path(root).parent.joinpath("config.json").as_posix() + self._config.to_json_file(output_config_file, use_diff=False) + + def nodes(self): + """Return model nodes.""" + return self._model.graph.node + + def initializer(self): + """Return model initializer.""" + return self._model.graph.initializer + + def graph(self): + """Return model graph.""" + return self._model.graph + + def ir_version(self): + """Return model ir_version.""" + return self._model.ir_version + + def opset_import(self): + """Return model opset_import.""" + return self._model.opset_import + + def remove_node(self, node): + """Remove a node from model.""" + if node in self._model.graph.node: + self._model.graph.node.remove(node) + + def remove_nodes(self, nodes_to_remove): + """Remove nodes from model.""" + for node in nodes_to_remove: + self.remove_node(node) + + def add_node(self, node): + """Add a node to model.""" + self._model.graph.node.extend([node]) + + def add_nodes(self, nodes_to_add): + """Add nodes to model.""" + self._model.graph.node.extend(nodes_to_add) + + def add_initializer(self, tensor): + """Add a initializer to model.""" + if find_by_name(tensor.name, self._model.graph.initializer) is None: + self._model.graph.initializer.extend([tensor]) + + def add_initializers(self, tensors): + """Add initializers to model.""" + for tensor in tensors: + self.add_initializer(tensor) + + def get_initializer(self, name): + """Get an initializer by name.""" + for tensor in self._model.graph.initializer: + if tensor.name == name: + return tensor + return None + + def get_initializer_share_num(self, name): + """Get the number of shares of initializer.""" + num = 0 + if self.get_initializer(name) is None: + return num + + for node in self.nodes(): + if name in node.input: + num += 1 + return num + + def get_node(self, name): + """Get a node by name.""" + for node in self._model.graph.node: + if node.name == name: + return node + return None + + def remove_initializer(self, tensor): + """Remove an initializer from model.""" + if tensor in self._model.graph.initializer: + self._model.graph.initializer.remove(tensor) + + def remove_initializers(self, init_to_remove): + """Remove initializers from model.""" + for initializer in init_to_remove: + self.remove_initializer(initializer) + + def set_initializer(self, tensor, array, raw=False): + """Update initializer.""" + old_tensor = self.get_initializer(tensor) + self.remove_initializer(old_tensor) + dims = old_tensor.dims + data_type = old_tensor.data_type + 
new_tensor = ( + onnx.helper.make_tensor(tensor, data_type, dims, array.flatten().tolist()) + if not raw + else onnx.helper.make_tensor(tensor, data_type, dims, array.tostring(), raw=raw) + ) + self.add_initializer(new_tensor) + + @property + def input_name_to_nodes(self): + """Return input names of nodes.""" + return self._input_name_to_nodes + + def _get_input_name_to_nodes(self, nodes): + """Get input names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_input_name_to_nodes(attr.g.node) + for input_name in node.input: + if len(input_name.strip()) != 0: + if input_name not in self._input_name_to_nodes: + self._input_name_to_nodes[input_name] = [node] + else: + self._input_name_to_nodes[input_name].append(node) + + @property + def output_name_to_node(self): + """Return output names of nodes.""" + return self._output_name_to_node + + def _get_output_name_to_node(self, nodes): + """Get output names of nodes.""" + for node in nodes: + attrs = [ + attr + for attr in node.attribute + if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS + ] + if len(attrs) > 0: + for attr in attrs: + self._get_output_name_to_node(attr.g.node) + for output_name in node.output: + if len(output_name.strip()) != 0: + self._output_name_to_node[output_name] = node + + def get_siblings(self, node): + """Get siblings nodes.""" + siblings = [] + for parent in self.get_parents(node): + for child in self.get_children(parent): + if child.name != node.name: + siblings.append(child) + return siblings + + def get_children(self, node, input_name_to_nodes=None): + """Get children nodes.""" + if input_name_to_nodes is None: + input_name_to_nodes = self._input_name_to_nodes + + children = [] + for output in node.output: + if output in input_name_to_nodes: + for child in input_name_to_nodes[output]: + children.append(child) # noqa: PERF402 + return children + + def get_parents(self, node, output_name_to_node=None): + """Get parents nodes.""" + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + + parents = [] + for input in node.input: + if input in output_name_to_node: + parents.append(output_name_to_node[input]) + return parents + + def get_parent(self, node, idx, output_name_to_node=None): + """Get parent node by idx.""" + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + + if len(node.input) <= idx: + return None + + input = node.input[idx] + if input not in output_name_to_node: + return None + + return output_name_to_node[input] + + def find_node_by_name(self, node_name, new_nodes_list, graph): + """Find out node by name.""" + graph_nodes_list = list(graph.node) # deep copy + graph_nodes_list.extend(new_nodes_list) + node = find_by_name(node_name, graph_nodes_list) + return node + + def find_nodes_by_initializer(self, graph, initializer): + """Find all nodes with given initializer as an input.""" + nodes = [] + for node in graph.node: + for node_input in node.input: + if node_input == initializer.name: + nodes.append(node) + return nodes + + def get_scale_zero(self, tensor): + """Help function to get scale and zero_point.""" + if not tensor.endswith("_quantized"): + logger.debug(f"Find {tensor} in the quantized graph is not quantized.") + return None, None + + def _searcher(tensor_name): + """Search scale and zero point tensor recursively.""" + node = 
self._input_name_to_nodes[tensor_name][0] + parent = self._output_name_to_node.get(tensor_name, None) + direct_int8 = ["Reshape", "Transpose", "Squeeze", "Unsqueeze", "MaxPool", "Pad", "Split"] + if parent is not None and parent.op_type in direct_int8: + fp32_tensor_name = ( + parent.input[0] + .replace("_quantized", "") + .replace("_QuantizeLinear", "") + .replace("_QuantizeInput", "") + ) + elif node.op_type in ["Gather"]: # pragma: no cover + fp32_tensor_name = ( + node.output[0] + .replace("_quantized", "") + .replace("_QuantizeLinear", "") + .replace("_QuantizeInput", "") + ) + else: + fp32_tensor_name = ( + tensor_name.replace("_quantized", "").replace("_QuantizeLinear", "").replace("_QuantizeInput", "") + ) + scale = fp32_tensor_name + "_scale" + scale_tensor = self.get_initializer(scale) + zo = fp32_tensor_name + "_zero_point" + zo_tensor = self.get_initializer(zo) + + if scale_tensor is None or zo_tensor is None: + if parent is not None: + scale_tensor, zo_tensor = _searcher(parent.input[0]) + return scale_tensor, zo_tensor + + node = self._input_name_to_nodes[tensor][0] + # TODO check if scale_tensor and zero_point is needed + # for bias of qlinearconv, scale and zero_point is not needed + if (node.op_type == "QLinearConv" and tensor == node.input[-1]) or ( + node.op_type == "QGemm" and tensor == node.input[-3] + ): + return None, None + else: + scale_tensor, zo_tensor = _searcher(tensor) + assert scale_tensor, f"missing scale for tensor {tensor}" + assert zo_tensor, f"missing zero point for tensor {tensor}" + return scale_tensor, zo_tensor + + def save_model_to_file(self, output_path, use_external_data_format=False): + """Save model to external data, which is needed for model size > 2GB.""" + from onnx.external_data_helper import convert_model_to_external_data + + if use_external_data_format: + convert_model_to_external_data( + self._model, all_tensors_to_one_file=True, location=Path(output_path).name + ".data" + ) + onnx.save_model(self._model, output_path) + + @staticmethod + def replace_node_input(node, old_input_name, new_input_name): + """Replace input of a node.""" + assert isinstance(old_input_name, str) and isinstance(new_input_name, str) + for j in range(len(node.input)): + if node.input[j] == old_input_name: + node.input[j] = new_input_name + + def replace_input_of_all_nodes(self, old_input_name, new_input_name, white_optype=None, black_optype=None): + """Replace inputs of all nodes.""" + if white_optype is None: + white_optype = [] + if black_optype is None: + black_optype = [] + if len(white_optype) > 0: + for node in self.model.graph.node: + if node.op_type in white_optype: + ONNXModel.replace_node_input(node, old_input_name, new_input_name) + else: + for node in self.model.graph.node: + if node.op_type not in black_optype: + ONNXModel.replace_node_input(node, old_input_name, new_input_name) + + @staticmethod + def replace_node_output(node, old_output_name, new_output_name): + """Replace output of a node.""" + assert isinstance(old_output_name, str) and isinstance(new_output_name, str) + for j in range(len(node.output)): + if node.output[j] == old_output_name: + node.output[j] = new_output_name + + def replace_output_of_all_nodes(self, old_output_name, new_output_name, white_optype=None, black_optype=None): + """Replace outputs of all nodes.""" + if white_optype is None: + white_optype = [] + if black_optype is None: + black_optype = [] + if len(white_optype) > 0: + for node in self.model.graph.node: + if node.op_type in white_optype: + 
ONNXModel.replace_node_output(node, old_output_name, new_output_name) + else: + for node in self.model.graph.node: + if node.op_type not in black_optype: + ONNXModel.replace_node_output(node, old_output_name, new_output_name) + + def remove_unused_nodes(self): + """Remove unused nodes.""" + unused_nodes = [] + nodes = self.nodes() + for node in nodes: + if ( + node.op_type == "Constant" + and node.output[0] not in self._model.graph.output + and node.output[0] not in self._input_name_to_nodes + ): + unused_nodes.append(node) + elif ( + node.op_type == "QuantizeLinear" + and len(self.get_children(node)) == 1 + and self.get_children(node)[0].op_type == "DequantizeLinear" + and node.input[0] not in self._output_name_to_node + and self.get_children(node)[0].output[0] not in self._input_name_to_nodes + ): + unused_nodes.append(node) + unused_nodes.extend(self.get_children(node)) + else: + # remove the node if it does not serve as the input or output of any other nodes + unused = True + for output in node.output: + if output in self._input_name_to_nodes or output in self.output(): + unused = False + break + for input in node.input: + if self.get_initializer(input) is not None: + continue + elif input in self._output_name_to_node or input in self.input(): + unused = False + break + if unused: + unused_nodes.append(node) + self.remove_nodes(unused_nodes) + + ununsed_weights = [] + for w in self._model.graph.initializer: + if w.name not in self._input_name_to_nodes and w.name not in self._model.graph.output: + ununsed_weights.append(w) + # Remove from graph.input + for graph_input in self.graph().input: + if graph_input.name == w.name: + self.graph().input.remove(graph_input) + + self.remove_initializers(ununsed_weights) + self.update() + + def topological_sort(self, enable_subgraph=False): + """Topological sort the model.""" + import copy + from collections import deque + + if not enable_subgraph: + input_name_to_nodes = {} + output_name_to_node = {} + for node in self.model.graph.node: + for input_name in node.input: + if len(input_name.strip()) != 0: + if input_name not in input_name_to_nodes: + input_name_to_nodes[input_name] = [node] + else: + input_name_to_nodes[input_name].append(node) + for output_name in node.output: + if len(output_name.strip()) != 0: + output_name_to_node[output_name] = node + else: # pragma: no cover + input_name_to_nodes = self._input_name_to_nodes + output_name_to_node = self._output_name_to_node + + all_nodes = {} + q = deque() + wait = deque() + for inp in self.model.graph.input: + q.extend(input_name_to_nodes[inp.name]) + for n in self.model.graph.node: + if all(i not in output_name_to_node and i not in self.input() for i in n.input): + q.append(n) + + while q: + n = q.popleft() + if not all(output_name_to_node[i].name in all_nodes for i in n.input if i in output_name_to_node): + if n not in wait: + wait.append(n) + continue + + all_nodes[n.name] = n + for out in n.output: + if out in input_name_to_nodes: + q.extend([i for i in input_name_to_nodes[out] if i.name not in all_nodes and i not in q]) + if len(q) == 0 and len(wait) != 0: + q = copy.deepcopy(wait) + wait.clear() + nodes = [i[1] for i in all_nodes.items()] + assert len(list({n.name for n in nodes})) == len(list({n.name for n in self.model.graph.node})) + self.model.graph.ClearField("node") + self.model.graph.node.extend(nodes) + + def get_nodes_chain(self, start, stop, result_chain=None): + """Get nodes chain with given start node and stop node.""" + from collections import deque + + from onnx import 
NodeProto + + if result_chain is None: + result_chain = [] + # process start node list + start_node = deque() + for node in start: + if isinstance(node, str): + start_node.append(node) + elif isinstance(node, NodeProto): + start_node.append(node.name) + else: + assert False, "'get_nodes_chain' function only support list[string]or list[NodeProto] params" # noqa: B011 + + # process stop node list + stop_node = [] + for node in stop: + if isinstance(node, str): + stop_node.append(node) + elif isinstance(node, NodeProto): + stop_node.append(node.name) + else: + assert False, "'get_nodes_chain' function only support list[string]or list[NodeProto] params" # noqa: B011 + + while start_node: + node_name = start_node.popleft() + if node_name in stop_node: + continue + if node_name not in result_chain: + result_chain.append(node_name) + else: + continue + + node = find_by_name(node_name, list(self.model.graph.node)) + for parent in self.get_parents(node): + start_node.append(parent.name) + + return result_chain + + def find_split_node_for_layer_wise_quantization(self): + """Find split node for layer wise quantization.""" + # find split nodes of decoder blocks + # embed -> decoder.0 -(split_node)-> ... -(split_node)-> decoder.n -(split_node)-> norm -> head + # after split: embed -> decoder.0, + # decoder.1, + # decoder.2, + # ..., + # decoder.n, + # norm -> head + start_nodes = [] + for node in self._model.graph.node: + start_node, qkv_nodes_list = None, None + if node.op_type == "SkipLayerNormalization": + start_node = node + qkv_nodes_list = [ + self.match_parent_path( + start_node, + ["MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], + [None, 0, 0, 0, 0], + ), + self.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], + [1, 1, 0, 0, 0], + ), + ] + if node.op_type == "Add": + start_node = node + qkv_nodes_list = [ + # match base attention structure + self.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], + [0, None, 0, 0, 0], + ), + self.match_parent_path( + start_node, ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0] + ), + # match gpt attention no past structure + self.match_parent_path( + start_node, + ["Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"], + [None, 0, 0, 0, 0, 0], + output_name_to_node=self.output_name_to_node, + return_indice=[], + ), + # match bart attention structure + self.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], + [0, None, 0, 0, 0, 0], + ), + self.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], + [1, None, 0, 0, 0, 0], + ), + self.match_parent_path( + start_node, + ["MatMul", "Mul", "MatMul", "Mul", "Div", "Add"], + [None, 0, None, 0, None, 0], + ), + self.match_parent_path( + start_node, + ["MatMul", "Mul", "MatMul", "SimplifiedLayerNormalization", "Add"], + [None, 0, None, 0, 0], + ), + ] + if not start_node: + continue + if not any(qkv_nodes_list): + continue + start_nodes.append(start_node) + return start_nodes + + def find_qkv_in_attention(self, find_all=False): + """Find qkv MatMul in Attention. + + Args: + find_all (bool, optional): find all qkv MatMul. 
Defaults to False + + Returns: + qkv (list): qkv MatMul list + """ + qkv = [] + for node in self._model.graph.node: + if node.op_type == "Attention": + qkv.append([node.name]) + continue + start_node, qkv_nodes_list = None, None + if node.op_type == "SkipLayerNormalization": + start_node = node + qkv_nodes_list = [ + self.match_parent_path( + start_node, + ["MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], + [None, 0, 0, 0, 0], + ), + self.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], + [1, 1, 0, 0, 0], + ), + ] + if node.op_type == "Add": + start_node = node + qkv_nodes_list = [ + # match base attention structure + self.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], + [0, None, 0, 0, 0], + ), + self.match_parent_path( + start_node, ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0] + ), + # match gpt attention no past structure + self.match_parent_path( + start_node, + ["Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"], + [None, 0, 0, 0, 0, 0], + output_name_to_node=self.output_name_to_node, + return_indice=[], + ), + # match bart attention structure + self.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], + [0, None, 0, 0, 0, 0], + ), + self.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], + [1, None, 0, 0, 0, 0], + ), + ] + if not start_node: + continue + if not any(qkv_nodes_list): + continue + qkv_nodes = [qkv for qkv in qkv_nodes_list if qkv is not None][-1] + other_inputs = [] + for input in start_node.input: + if input not in self.output_name_to_node: + continue + if input == qkv_nodes[0].output[0]: + continue + other_inputs.append(input) + if len(other_inputs) != 1: + continue + root_input = other_inputs[0] + input_name_to_nodes = self.input_name_to_nodes + children = input_name_to_nodes[root_input] + children_types = [child.op_type for child in children] + if children_types.count("MatMul") == 3: + qkv.append([child.name for child in children if child.op_type == "MatMul"]) + if not find_all: + break + return qkv + + def find_ffn_matmul(self, attention_index, attention_matmul_list, block_len): + """Find MatMul in FFN. 
+ + Args: + attention_index (list): index of Attention + attention_matmul_list (list): list of Attention and MatMul nodes + block_len (int): block length + + Returns: + list: list of MatMul in FFN + """ + ffn_matmul = [] + for idx in range(len(attention_index)): + if idx != len(attention_index) - 1: + index = attention_index[idx + 1] + if index - 2 >= 0: + ffn_matmul.append([attention_matmul_list[index - 2], attention_matmul_list[index - 1]]) + else: + index = attention_index[idx] + if index + block_len - 1 < len(attention_matmul_list): + ffn_matmul.append( + [attention_matmul_list[index + block_len - 2], attention_matmul_list[index + block_len - 1]] + ) + return ffn_matmul + + def export(self, save_path, conf): + """Export Qlinear to QDQ model.""" + from neural_compressor.config import ONNXQlinear2QDQConfig + from neural_compressor.utils.export import onnx_qlinear_to_qdq + + if isinstance(conf, ONNXQlinear2QDQConfig): + add_nodes, remove_nodes, inits = onnx_qlinear_to_qdq(self._model, self._input_name_to_nodes) + self.add_nodes(add_nodes) + self.remove_nodes(remove_nodes) + self.add_initializers(inits) + self.update() + self.remove_unused_nodes() + self.topological_sort() + self.save(save_path) + else: + logger.warning("Unsupported config for export, only ONNXQlinear2QDQConfig is supported!") + exit(0) + + def add_tensors_to_outputs(self, tensor_names): + """Add the tensors to the model outputs to gets their values. + + Args: + tensor_names: The names of tensors to be dumped. + """ + added_outputs = [] + for tensor in tensor_names: + if tensor not in self.output(): + added_tensor = onnx.helper.ValueInfoProto() + added_tensor.name = tensor + added_outputs.append(added_tensor) + self._model.graph.output.extend(added_outputs) # pylint: disable=no-member + + def remove_tensors_from_outputs(self, tensor_names): + """Remove the tensors from the model outputs. + + Args: + tensor_names: The names of tensors to be removed. + """ + removed_outputs = [] + for tensor in tensor_names: + if tensor in self.output(): + removed_outputs.append(self._model.graph.output[self.output().index(tensor)]) + for output in removed_outputs: + self._model.graph.output.remove(output) + + def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=None): + """Find parent node based on constraints on op_type. + + Args: + node (str): current node name. + parent_op_type (str): constraint of parent node op_type. + output_name_to_node (dict): dictionary with output name as key, and node as value. + exclude (list): list of nodes that are excluded (not allowed to match as parent). + + Returns: + parent: The matched parent node. None if not found. + index: The input index of matched parent node. None if not found. + """ + if exclude is None: + exclude = [] + for i, input in enumerate(node.input): + if input in output_name_to_node: + parent = output_name_to_node[input] + if parent.op_type == parent_op_type and parent not in exclude: + return parent, i + return None, None + + def match_parent( + self, + node, + parent_op_type, + input_index=None, + output_name_to_node=None, + exclude=None, + return_indice=None, + ): + """Find parent node based on constraints on op_type and index. + + Args: + node (str): current node name. + parent_op_type (str): constraint of parent node op_type. + input_index (int or None): only check the parent given input index of current node. + output_name_to_node (dict): dictionary with output name as key, and node as value. 
+ exclude (list): list of nodes that are excluded (not allowed to match as parent). + return_indice (list): a list to append the input index when input_index is None. + + Returns: + parent: The matched parent node. + """ + assert node is not None + assert input_index is None or input_index >= 0 + if exclude is None: + exclude = [] + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + + if input_index is None: + parent, index = self.match_first_parent(node, parent_op_type, output_name_to_node, exclude) + if return_indice is not None: + return_indice.append(index) + return parent + + if input_index >= len(node.input): + return None + + parent = self.get_parent(node, input_index, output_name_to_node) + if parent is not None and parent.op_type == parent_op_type and parent not in exclude: + return parent + + return None + + def match_parent_path( + self, + node, + parent_op_types, + parent_input_index, + output_name_to_node=None, + return_indice=None, + ): + """Find a sequence of input edges based on constraints on parent op_type and index. + + Args: + node (str): current node name. + parent_op_types (str): constraint of parent node op_type of each input edge. + parent_input_index (list): constraint of input index of each input edge. + None means no constraint. + output_name_to_node (dict): dictionary with output name as key, and node as value. + return_indice (list): a list to append the input index when there is + no constraint on input index of an edge. + + Returns: + parents: a list of matched parent node. + """ + assert len(parent_input_index) == len(parent_op_types) + + if output_name_to_node is None: + output_name_to_node = self._output_name_to_node + + current_node = node + matched_parents = [] + for i, op_type in enumerate(parent_op_types): + matched_parent = self.match_parent( + current_node, + op_type, + parent_input_index[i], + output_name_to_node, + exclude=[], + return_indice=return_indice, + ) + if matched_parent is None: + return None + + matched_parents.append(matched_parent) + current_node = matched_parent + + return matched_parents + + def is_smoothquant_model(self): + """Check the model is smooth quantized or not. + + Returns: + bool: the model is smooth quantized or not. + """ + for init in self.model.graph.initializer: # noqa: SIM110 + if "_smooth_scale" in init.name: + return True + return False + + def find_split_nodes(self): + """Find split nodes for layer-wise quantization.""" + split_nodes = self.find_split_node_for_layer_wise_quantization() + return split_nodes + + def split_model_with_node( + self, split_node_name, path_of_model_to_split, shape_infer=True, save_both_split_models=True + ): + """Split model into two parts at a given node. + + Args: + split_node_name (str): name of the node where the model is split at> + path_of_model_to_split (str): path of model to be split. + shape_infer (bool): do shape inference. Default is True. + save_both_split_models (bool): whether to save the two split models. + False means only save the first split model. + True means save both the two split models. + Default id True. + + Returns: + tuple: the first split model, the second split model + """ + # origin model : ... -> node_1 -> split_node -> node_2 -> ... + # split model 1: ... -> node_1 -> split_node + # split model 2: node_2 -> ... 
+ + split_model_part_1 = onnx.ModelProto() + split_model_part_1.CopyFrom(self._model) + split_model_part_1.graph.ClearField("node") + + split_model_part_2 = onnx.ModelProto() + split_model_part_2.CopyFrom(self._model) + split_model_part_2.graph.ClearField("node") + + split_node_output = None + part_idx = 1 + for node in self._model.graph.node: + if part_idx == 1: + split_model_part_1.graph.node.append(node) + elif part_idx == 2: + split_model_part_2.graph.node.append(node) + + if node.name == split_node_name: + split_node_output = node.output + part_idx = 2 + + assert len(split_node_output) == 1, ( + f"Only support split at node with 1 output tensor, while current split node {split_node_name} has {len(split_node_output)} output tensors" + ) + split_tensor_name = split_node_output[0] + + # infer shape of the model to be split + if shape_infer: + try: + from neural_compressor.adaptor.ox_utils.util import infer_shapes + + self._model = infer_shapes(self._model, auto_merge=True, base_dir=os.path.dirname(self._model_path)) + except Exception as e: # pragma: no cover + logger.error( + "Shape infer fails for layer-wise quantization. " + "We would recommend checking the graph optimization level of your model " + "and setting it to 'DISABLE_ALL' or 'ENABLE_BASIC', " + "as this may help avoid this error." + ) + raise e + + split_tensor_type, split_tensor_shape = self._get_output_type_shape_by_tensor_name(split_tensor_name) + split_tensor = onnx.helper.make_tensor_value_info(split_tensor_name, split_tensor_type, split_tensor_shape) + + split_model_part_1 = ONNXModel(split_model_part_1, ignore_warning=True) + split_model_part_2 = ONNXModel(split_model_part_2, ignore_warning=True) + + # remove unused input & output + split_model_part_1._remove_unused_input_output() + split_model_part_2._remove_unused_input_output() + + split_model_part_1.model.graph.output.append(split_tensor) + split_model_part_2.model.graph.input.append(split_tensor) + + insert_output_for_model_1 = [] + insert_input_for_model_2 = [] + for output in split_model_part_1.output_name_to_node: + if output in split_model_part_2.input_name_to_nodes: + output_type, output_shape = self._get_output_type_shape_by_tensor_name(output) + output_tensor = onnx.helper.make_tensor_value_info(output, output_type, output_shape) + if output_tensor not in split_model_part_1.model.graph.output: + insert_output_for_model_1.append(output_tensor) + if output_tensor not in split_model_part_2.model.graph.input: + insert_input_for_model_2.append(output_tensor) + + # insert model 1 output + for output in insert_output_for_model_1: + split_model_part_1.model.graph.output.append(output) + + # insert model 2 input + for input in insert_input_for_model_2: + split_model_part_2.model.graph.input.append(input) + + # remove unused init + split_model_part_1.remove_unused_init() + split_model_part_2.remove_unused_init() + + split_model_part_1.update() + split_model_part_2.update() + + dir_of_model_to_split = os.path.dirname(path_of_model_to_split) + + split_model_part_1.load_model_initializer_by_tensor(dir_of_model_to_split) + split_model_part_1_path = os.path.join(dir_of_model_to_split, "split_model_part_1.onnx") + split_model_part_1.model_path = split_model_part_1_path + split_model_part_1._save_split_model(split_model_part_1_path) + split_model_part_1.check_is_large_model() + logger.debug(f"save split model part 1 to {split_model_part_1_path} for layer wise quantization") + + if save_both_split_models: + 
split_model_part_2.load_model_initializer_by_tensor(dir_of_model_to_split) + split_model_part_2_path = os.path.join(dir_of_model_to_split, "split_model_part_2.onnx") + split_model_part_2.model_path = split_model_part_2_path + split_model_part_2._save_split_model(split_model_part_2_path) + split_model_part_2.check_is_large_model() + logger.debug(f"save split model part 2 to {split_model_part_2_path} for layer wise quantization") + return split_model_part_1, split_model_part_2 + else: + return split_model_part_1, split_model_part_2 + + def _save_split_model(self, save_path): + """Save split model as external data for layer wise quantization. + + Args: + save_path (str): the path to save the split model + """ + if os.path.exists(save_path + "_data"): + os.remove(save_path + "_data") + onnx.save_model( + self._model, + save_path, + save_as_external_data=True, + all_tensors_to_one_file=True, + location=save_path.split("/")[-1] + "_data", + size_threshold=1024, + convert_attribute=False, + ) + + def _get_output_type_shape_by_tensor_name(self, tensor_name): + """Get output type and shape with a tensor name. + + Args: + tensor_name (str): name of a tensor + + Returns: + tuple: output type and shape + """ + elem_type = onnx.TensorProto.FLOAT + shape = None + for output in self._model.graph.value_info: + if output.name == tensor_name: + elem_type = output.type.tensor_type.elem_type + shape = [ + dim.dim_value if dim.HasField("dim_value") else -1 for dim in output.type.tensor_type.shape.dim + ] + break + return elem_type, shape + + def _remove_unused_input_output(self): + """Remove unused input & output for split model.""" + remove_outputs = [] + remove_inputs = [] + for output in self._model.graph.output: + if output.name not in self.output_name_to_node: + remove_outputs.append(output) + + for input in self._model.graph.input: + if input.name not in self.input_name_to_nodes: + remove_inputs.append(input) + + for output in remove_outputs: + self._model.graph.output.remove(output) + for input in remove_inputs: + self._model.graph.input.remove(input) + + def remove_unused_init(self): + """Remove unused init.""" + remov_inits = [] + for init in self._model.graph.initializer: + if init.name not in self.input_name_to_nodes: + remov_inits.append(init) + self.remove_initializers(remov_inits) + + def load_model_initializer_by_tensor(self, data_path=None): + """Load model initializer by tensor. + + Args: + data_path (str, optional): the directory of saved initializer. Defaults to None. + """ + from onnx.external_data_helper import load_external_data_for_tensor + + if data_path is None: + data_path = os.path.dirname(self._model_path) + for init in self._model.graph.initializer: + if init.HasField("data_location") and init.data_location == onnx.TensorProto.EXTERNAL: + load_external_data_for_tensor(init, data_path) + + def write_external_data_to_new_location(self, external_data_location="external.data", overwrite=False): + """Write external data of merged quantized model to new location to save memory. + + Args: + external_data_location (str, optional): external data location of merged quantized model. + Defaults to "external.data". + overwrite (bool, optional): if True, remove existed externa data. Defaults to False. 
+ """ + from onnx.external_data_helper import convert_model_to_external_data, write_external_data_tensors + + if overwrite and os.path.exists(os.path.join(os.path.dirname(self._model_path), external_data_location)): + os.remove(os.path.join(os.path.dirname(self._model_path), external_data_location)) + self.load_model_initializer_by_tensor() + convert_model_to_external_data(self._model, location=external_data_location) + # TODO : if init is already saved, skip write it + write_external_data_tensors(self._model, filepath=os.path.dirname(self._model_path)) + + def merge_split_models(self, to_merge_model): + """Merge two split model into final model.""" + to_merge_model.write_external_data_to_new_location() + self.add_nodes(list(to_merge_model.nodes())) + self.add_initializers(list(to_merge_model.initializer())) + self.update() + + # add new output + for output in to_merge_model.graph().output: + if output.name not in self.output(): + self._model.graph.output.append(output) + + # remove unused output + remove_output = [] + for output in self._model.graph.output: + if output.name in to_merge_model.input(): + remove_output.append(output) + for output in remove_output: + self._model.graph.output.remove(output) + + # add new input + for input in to_merge_model.graph().input: + if ( + input.name not in self.input() + and input.name not in self.output() + and input.name not in self.output_name_to_node + ): + self._model.graph.input.append(input) + + def re_org_output(self, origin_output): + """Re-org output of merged model for layer-wise quantization.""" + outputs = {} + tmp_remove = [] + for output in self._model.graph.output: + outputs[output.name] = output + tmp_remove.append(output) + + for output in tmp_remove: + self._model.graph.output.remove(output) + + for out_name in origin_output: + self._model.graph.output.append(outputs[out_name]) diff --git a/onnxruntime/python/tools/quantization/neural_compressor/util.py b/onnxruntime/python/tools/quantization/neural_compressor/util.py new file mode 100644 index 0000000000000..aae01b4defd1f --- /dev/null +++ b/onnxruntime/python/tools/quantization/neural_compressor/util.py @@ -0,0 +1,80 @@ +# +# The implementation of this file is based on: +# https://github.com/intel/neural-compressor/tree/master/neural_compressor +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Helper classes or functions for onnxrt adaptor.""" + +import importlib +import logging + +import numpy as np + +logger = logging.getLogger("neural_compressor") + + +MAXIMUM_PROTOBUF = 2147483648 + + +def simple_progress_bar(total, i): + """Progress bar for cases where tqdm can't be used.""" + progress = i / total + bar_length = 20 + bar = "#" * int(bar_length * progress) + spaces = " " * (bar_length - len(bar)) + percentage = progress * 100 + print(f"\rProgress: [{bar}{spaces}] {percentage:.2f}%", end="") + + +def find_by_name(name, item_list): + """Helper function to find item by name in a list.""" + items = [] + for item in item_list: + assert hasattr(item, "name"), f"{item} should have a 'name' attribute defined" # pragma: no cover + if item.name == name: + items.append(item) + if len(items) > 0: + return items[0] + else: + return None + + +def to_numpy(data): + """Convert to numpy ndarrays.""" + import torch + + if not isinstance(data, np.ndarray): + if not importlib.util.find_spec("torch"): + logger.error( + "Please install torch to enable subsequent data type check and conversion, " + "or reorganize your data format to numpy array." + ) + exit(0) + if isinstance(data, torch.Tensor): + if data.dtype is torch.bfloat16: # pragma: no cover + return data.detach().cpu().to(torch.float32).numpy() + if data.dtype is torch.chalf: # pragma: no cover + return data.detach().cpu().to(torch.cfloat).numpy() + return data.detach().cpu().numpy() + else: + try: + return np.array(data) + except Exception: + assert False, ( # noqa: B011 + f"The input data for onnx model is {type(data)}, which is not supported to convert to numpy ndarrays." + ) + else: + return data diff --git a/onnxruntime/python/tools/quantization/neural_compressor/weight_only.py b/onnxruntime/python/tools/quantization/neural_compressor/weight_only.py new file mode 100644 index 0000000000000..4eda7efc9b8fe --- /dev/null +++ b/onnxruntime/python/tools/quantization/neural_compressor/weight_only.py @@ -0,0 +1,932 @@ +# +# The implementation of this file is based on: +# https://github.com/intel/neural-compressor/tree/master/neural_compressor +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modifications: +# Add k-quant quantization method. +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""WeightOnly for onnxrt adaptor.""" + +import copy +import logging +import os +import sys + +import numpy as np +import onnx +from onnx import numpy_helper +from onnx.helper import np_dtype_to_tensor_dtype + +import onnxruntime as ort + +from .onnx_model import ONNXModel +from .util import simple_progress_bar + +logger = logging.getLogger("neural_compressor") + + +def make_matmul_weight_only_node( + node, + weight_shape, + num_bits, + group_size, + k_blocks, + q_weight, + scale, + zero_point, + accuracy_level=0, +): # pragma: no cover + """Build MatMulNBits node. 
+ + Args: + node: original matmul node + weight_shape: original weight shape + num_bits (int): num_bits + group_size (int): how many elements share one scale/zp + k_blocks (int): block number + q_weight (array): quantized weight + scale (array): scale + zero_point (array): zero point + accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8). + + Returns: + matmul_weight_only_node: MatMulNBits node + new_inits: initializers of the new node + """ + blob_size = group_size * num_bits // 8 + packed = np.zeros((q_weight.shape[0], blob_size), dtype="uint8") + q_weight_name = node.input[1] + f"_Q{num_bits!s}G{group_size!s}" + input_names = [node.input[0], q_weight_name] + new_inits = [] + kwargs = {} + + op_type = "MatMulNBits" + + # pack quantized weight + if num_bits == 4: + q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4 + packed[:, :] = q_weight_pairs[:, :blob_size] + elif num_bits == 8: + packed = q_weight + else: + logger.error(f"MatMulNBits does not have kernel support for num_bits = {num_bits}.") + + packed = np.reshape(packed, (-1, k_blocks, blob_size)) + + # build scale tensor + scale = np.reshape(scale, (-1, k_blocks)) + assert scale.dtype == np.float32 or scale.dtype == np.float16 + scale_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_scale", + data_type=np_dtype_to_tensor_dtype(scale.dtype), + dims=scale.shape, + vals=scale.tobytes(), + raw=True, + ) + input_names.append(scale_tensor.name) + new_inits.append(scale_tensor) + + # build zero_point tensor + if zero_point is not None: + if num_bits == 8: + packed_zp = zero_point.astype("uint8") + elif num_bits == 4: + # For 4-bit case, the default zeros is 0x8. So it is 0x88 = 136 if we fill lower/higher 4 bits with 0x8. + packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8") + # create an index array + idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1) + # separate odd and even indices + even_idx = idx[::2] + odd_idx = idx[1::2] + # vectorized operation for even and odd indices + packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel() + packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4) + else: + raise ValueError(f"MatMulNBits does not have kernel support for num_bits = {num_bits}.") + + packed_zp = np.reshape(packed_zp, (weight_shape[1], -1)) + zp_tensor = onnx.helper.make_tensor( + name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True + ) + input_names.append(zp_tensor.name) + new_inits.append(zp_tensor) + + # set kwargs + kwargs["K"] = weight_shape[0] + kwargs["N"] = weight_shape[1] + kwargs["bits"] = num_bits + kwargs["block_size"] = group_size + if accuracy_level > 0: + # require onnxruntime > 1.16.3 + kwargs["accuracy_level"] = accuracy_level + + q_weight_tensor = onnx.helper.make_tensor( + name=q_weight_name, + data_type=2, + dims=packed.shape, + vals=packed.tobytes(), + raw=True, + ) + new_inits.append(q_weight_tensor) + + matmul_weight_only_node = onnx.helper.make_node( + op_type, + inputs=input_names, + outputs=node.output, + name=node.name + "_Q" + str(num_bits) if node.name else "_Q" + str(num_bits), + domain="com.microsoft", + **kwargs, + ) + return matmul_weight_only_node, new_inits + + +def quant_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0): + """Quantize tensor per group. + + Args: + data : input weight + num_bits (int, optional): num_bits. Defaults to 4. 
+ group_size (int, optional): how many elements share one scale/zp. Defaults to 4. + scheme (str, optional): quantization scheme. Defaults to "asym". + dtype (str, optional): data type. Defaults to "int". + ratio (float, optional): percentile of clip. Defaults to 1.0. + + Returns: + output: quantized weight + scale: scale + zero_point: zero point + """ + data = np.reshape(data, (-1, group_size)) + if scheme == "asym" or dtype == "uint": + maxq = 2**num_bits - 1 + minq = 0 + elif scheme == "sym": + maxq = 2 ** (num_bits - 1) - 1 if num_bits != 1 else 0 + minq = -(2 ** (num_bits - 1)) if num_bits != 1 else -1 + + rmin = np.min(data, axis=1, keepdims=True) * ratio + rmax = np.max(data, axis=1, keepdims=True) * ratio + if scheme == "sym": + max_range = np.maximum(np.abs(rmin), np.abs(rmax)) + scale = np.ones(rmax.shape) + mask = max_range > 0 + scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq) + zero_point = ( + np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) + ) + else: + scale = np.ones(rmax.shape) + scale[rmin != rmax] = np.array( + [float(i) / (maxq - minq) for i in (rmax - rmin)[rmin != rmax].flatten().tolist()] + ) + zero_point = ( + ((np.zeros(scale.shape) - rmin) / scale).round() + if dtype == "int" + else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8") + ) + + q_weight = np.empty_like(data, dtype=scale.dtype) + np.divide(data, scale, out=q_weight) + np.add(q_weight, zero_point, out=q_weight) + np.round(q_weight, out=q_weight) + np.clip(q_weight, minq, maxq, out=q_weight) + + return q_weight, scale, zero_point + + +def quant_tensor_k_quant_cpu(data, num_bits=4, group_size=32): + """Quantize tensor per group based on k quant. + + Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c + + Args: + data : input weight + num_bits (int, optional): num_bits. Defaults to 4. + group_size (int, optional): how many elements share one scale/zp. Defaults to 32. 
+ + Returns: + output: quantized weight + scale: scale + zero_point: zero point + """ + data = np.reshape(data, (-1, group_size)).astype(np.float32) # nb = data.shape[0], (nb, group_size) + maxq = 2**num_bits - 1 + minq = 0 + sum_x2 = np.sum(data**2, axis=1, keepdims=True) # (nb, 1) + av_x = np.sqrt(sum_x2 / group_size) # (nb, 1) + weights = np.add(av_x, np.abs(data)) # (nb, group_size) + rmin = np.min(data, axis=1, keepdims=True) # (nb, 1) + rmax = np.max(data, axis=1, keepdims=True) # (nb, 1) + sum_w = np.sum(weights, axis=1, keepdims=True) # (nb, 1) + sum_x = np.sum(weights * data, axis=1, keepdims=True) # (nb, group_size) + iscale = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1) + mask = rmin != rmax + iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask]) + scale = 1 / iscale + quant_data = np.clip(np.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size) + diff = scale * quant_data + rmin - data # (nb, group_size) + best_mad = np.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1) + nstep = 20 + rdelta = 0.1 + # nstep * rdelta = -2 * rrmin, maxq - minq = 2**num_bits - 1 + rrmin = -1 + for is_ in range(nstep): + iscale_new = np.ones(rmax.shape, dtype=data.dtype) # (nb, 1) + factor = np.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0] + mask = rmin != rmax + iscale_new[mask] = factor / (rmax[mask] - rmin[mask]) + quant_data_new = np.clip(np.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size) + mul_weights_quant_data_new = weights * quant_data_new + sum_l = np.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1) + sum_l2 = np.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1) + sum_xl = np.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1) + D = np.subtract(sum_w * sum_l2, sum_l**2) # noqa: N806 + + this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1) + this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1) + + diff = this_scale * quant_data_new + this_min - data # (nb, group_size) + mad = np.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1) + + mad_1 = np.array(mad) + best_mad_1 = np.array(best_mad) + idx_to_replace = np.where(mad_1 < best_mad_1)[0] + quant_data[idx_to_replace, :] = quant_data_new[idx_to_replace, :] + best_mad[idx_to_replace] = mad[idx_to_replace] + scale[idx_to_replace] = this_scale[idx_to_replace] + rmin[idx_to_replace] = this_min[idx_to_replace] + + zero_point = np.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8") + scale = scale.astype(np.float64) + q_weight = np.empty_like(data, dtype=scale.dtype) + np.divide(data, scale, out=q_weight) + np.add(q_weight, zero_point, out=q_weight) + np.round(q_weight, out=q_weight) + np.clip(q_weight, minq, maxq, out=q_weight) + + return q_weight, scale, zero_point + + +def quant_tensor_k_quant_cuda(data, num_bits=4, group_size=32): + """Quantize tensor per group based on k quant. + + Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c + + Args: + data : input weight + num_bits (int, optional): num_bits. Defaults to 4. + group_size (int, optional): how many elements share one scale/zp. Defaults to 4. 
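For illustration only (not part of the diff): quant_tensor_k_quant_cpu returns (q_weight, scale, zero_point) in the same per-group layout as quant_tensor, so the two can be compared directly on reconstruction error. A rough comparison sketch, assuming the functions defined above are in scope:

import numpy as np

rng = np.random.default_rng(0)
w = rng.standard_normal((64, 32)).astype(np.float32)

q1, s1, zp1 = quant_tensor(w, num_bits=4, group_size=32, scheme="asym", dtype="uint")
q2, s2, zp2 = quant_tensor_k_quant_cpu(w, num_bits=4, group_size=32)

mse_rtn = np.mean((w - (s1 * (q1 - zp1)).reshape(w.shape)) ** 2)
mse_kq = np.mean((w - (s2 * (q2 - zp2)).reshape(w.shape)) ** 2)
print(f"plain RTN mse: {mse_rtn:.6f}, k-quant mse: {mse_kq:.6f}")  # k-quant error is often lower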
+ + Returns: + output: quantized weight + scale: scale + zero_point: zero point + """ + try: + import cupy as cp + import torch + + if torch.cuda.is_available(): + data = cp.asarray(data) + data = data.reshape((-1, group_size)).astype(cp.float32) # nb = data.shape[0], (nb, group_size) + maxq = 2**num_bits - 1 + minq = 0 + sum_x2 = cp.sum(data**2, axis=1, keepdims=True) # (nb, 1) + av_x = cp.sqrt(sum_x2 / group_size) # (nb, 1) + weights = cp.add(av_x, cp.abs(data)) # (nb, group_size) + rmin = cp.min(data, axis=1, keepdims=True) # (nb, 1) + rmax = cp.max(data, axis=1, keepdims=True) # (nb, 1) + sum_w = cp.sum(weights, axis=1, keepdims=True) # (nb, 1) + sum_x = cp.sum(weights * data, axis=1, keepdims=True) # (nb, group_size) + iscale = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1) + mask = rmin != rmax + iscale[mask] = (maxq - minq) / (rmax[mask] - rmin[mask]) + scale = 1 / iscale + quant_data = cp.clip(cp.round(iscale * (data - rmin)), minq, maxq) # (nb, group_size) + diff = scale * quant_data + rmin - data # (nb, group_size) + best_mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1) + nstep = 20 + rdelta = 0.1 + rrmin = -1 + for is_ in range(nstep): + iscale_new = cp.ones(rmax.shape, dtype=data.dtype) # (nb, 1) + factor = cp.array([rrmin + rdelta * is_ + maxq - minq]).astype(data.dtype)[0] + mask = rmin != rmax + iscale_new[mask] = factor / (rmax[mask] - rmin[mask]) + quant_data_new = cp.clip(cp.round(iscale_new * (data - rmin)), minq, maxq) # (nb, group_size) + mul_weights_quant_data_new = weights * quant_data_new + sum_l = cp.sum(mul_weights_quant_data_new, axis=1, keepdims=True) # (nb, 1) + sum_l2 = cp.sum(mul_weights_quant_data_new * quant_data_new, axis=1, keepdims=True) # (nb, 1) + sum_xl = cp.sum(mul_weights_quant_data_new * data, axis=1, keepdims=True) # (nb, 1) + D = cp.subtract(sum_w * sum_l2, sum_l**2) # noqa: N806 + + this_scale = (sum_w * sum_xl - sum_x * sum_l) / D # (nb, 1) + this_min = (sum_l2 * sum_x - sum_l * sum_xl) / D # (nb, 1) + + diff = this_scale * quant_data_new + this_min - data # (nb, group_size) + mad = cp.sum(weights * diff**2, axis=1, keepdims=True) # (nb, 1) + + mad_1 = cp.array(mad) + best_mad_1 = cp.array(best_mad) + idx_to_replace = cp.where(mad_1 < best_mad_1)[0] + quant_data[idx_to_replace, :] = quant_data_new[idx_to_replace, :] + best_mad[idx_to_replace] = mad[idx_to_replace] + scale[idx_to_replace] = this_scale[idx_to_replace] + rmin[idx_to_replace] = this_min[idx_to_replace] + + zero_point = cp.clip(((-rmin) / scale).round(), 0, maxq).astype("uint8") + scale = scale.astype(cp.float64) + q_weight = cp.empty_like(data, dtype=scale.dtype) + cp.divide(data, scale, out=q_weight) + cp.add(q_weight, zero_point, out=q_weight) + cp.round(q_weight, out=q_weight) + cp.clip(q_weight, minq, maxq, out=q_weight) + + return q_weight.get(), scale.get(), zero_point.get() + else: + logger.warning( + "Try to use k-quant quantization on CUDA. However, CUDA is not available." + "Fall back to k-quant quantization on CPU." + ) + return quant_tensor_k_quant_cpu(data, num_bits, group_size) + except ImportError: + logger.info( + "Now we are using k-quant quantization on cpu, which is time consuming." + "Please consider install cupy to speed up on CUDA. See https://cupy.dev/" + "Please also install torch to check CUDA availability." + ) + return quant_tensor_k_quant_cpu(data, num_bits, group_size) + + +def qdq_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0): + """Quant dequant tensor per group. 
+
+    Args:
+        data : input weight
+        num_bits (int, optional): num_bits. Defaults to 4.
+        group_size (int, optional): how many elements share one scale/zp. Defaults to 32.
+        scheme (str, optional): quantization scheme. Defaults to "asym".
+        dtype (str, optional): data type. Defaults to "int".
+        ratio (float, optional): percentile of clip. Defaults to 1.0.
+
+    Returns:
+        output: quant-dequant weight
+    """
+    org_shape = data.shape
+    weight, scale, zp = quant_tensor(data, num_bits, group_size, scheme, dtype, ratio)
+    return np.reshape(scale * (weight - zp), org_shape)
+
+
+def pad_tensor(weight, group_size, k_blocks):
+    """Pad the tensor rows so that the row count is divisible by group_size.
+
+    Args:
+        weight (array): weight
+        group_size (int): how many elements share one scale/zp
+        k_blocks (int): the number of blocks
+
+    Returns:
+        weight: padded weight
+    """
+    if group_size == -1:
+        return weight
+
+    org_w_shape = weight.shape
+    padded_rows = k_blocks * group_size
+    pad_len = padded_rows - org_w_shape[0]
+
+    if pad_len > 0:
+        weight = np.pad(weight, ((0, pad_len), (0, 0)), "constant")
+
+    return weight
+
+
+def rtn_quantize(
+    model,
+    weight_config={},  # noqa: B006
+    num_bits=4,
+    group_size=32,
+    scheme="asym",
+    ratios={},  # noqa: B006
+    accuracy_level=0,
+    providers=["CPUExecutionProvider"],  # noqa: B006
+    algorithm="k_quant",
+):
+    """Quantize the model with the round-to-nearest (RTN) method.
+
+    Args:
+        model (ModelProto or ONNXModel): onnx model
+        weight_config (dict): quantization config
+            For example,
+            weight_config = {
+                'fc2':
+                    {
+                        'bits': 4,
+                        'group_size': 32,
+                        'scheme': 'sym',
+                        'algorithm': 'RTN'
+                    }
+            }
+        num_bits (int, optional): num_bits. Default is 4.
+        group_size (int, optional): how many elements share one scale/zp. Default is 32.
+        scheme (str, optional): sym or asym. Defaults to "asym".
+        ratios (dict, optional): percentile of clip. Defaults to {}.
+        accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
+ providers (list): providers to use + + Returns: + model: fake quantized ONNXModel + """ + model = ONNXModel(model) + base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" + new_nodes = [] + remove_nodes = [] + total_num = len([i for i in model.nodes() if i.op_type in ["MatMul"]]) + curr_id = 0 + for node in model.nodes(): + if node.op_type in ["MatMul"]: + curr_id += 1 + simple_progress_bar(total_num, curr_id) + if ( + node.op_type in ["MatMul"] + and model.get_initializer(node.input[1]) is not None + and weight_config.get(node.name, {}) != "fp32" + ): + weight_tensor = model.get_initializer(node.input[1]) + weight = numpy_helper.to_array(weight_tensor, base_dir=base_dir).copy() + if len(weight.shape) != 2: + continue + + dtype = weight.dtype + + if node.name in weight_config: + num_bits = weight_config[node.name]["bits"] + group_size = weight_config[node.name]["group_size"] + scheme = weight_config[node.name]["scheme"] + + org_w_shape = weight.shape # ic, oc + group_size = group_size if group_size != -1 else org_w_shape[0] + + k_blocks = (org_w_shape[0] - 1) // group_size + 1 + init_share_num = model.get_initializer_share_num(node.input[1]) + + weight = pad_tensor(weight, group_size, k_blocks) + + satisfy_MatMulNBits_condition = num_bits == 4 or num_bits == 8 # noqa: N806 + + if satisfy_MatMulNBits_condition: # pragma: no cover + if algorithm == "k_quant": + q_weight, scale, zp = quant_tensor_k_quant_cuda(weight.T, num_bits, group_size) + else: + q_weight, scale, zp = quant_tensor( + weight.T, num_bits, group_size, scheme, "uint", ratios.get(node.input[1], 1) + ) + + q_matmul_node, new_inits = make_matmul_weight_only_node( + node=node, + weight_shape=org_w_shape, + num_bits=num_bits, + group_size=group_size, + k_blocks=k_blocks, + q_weight=q_weight.astype("uint8"), + scale=scale.astype(dtype), + zero_point=zp if scheme == "asym" else None, + accuracy_level=accuracy_level, + ) + + model.add_initializers(new_inits) + remove_nodes.append(node) + new_nodes.append(q_matmul_node) + else: + q_weight = qdq_tensor(weight.T, num_bits, group_size, scheme, "int", ratios.get(node.input[1], 1)) + q_weight = np.reshape(q_weight, (org_w_shape[1], -1)) + q_weight = np.transpose(q_weight) + q_weight = q_weight[: org_w_shape[0], :].astype(dtype) + q_weight_tensor = onnx.helper.make_tensor( + name=node.input[1] + f"_Q{num_bits!s}G{group_size!s}", + data_type=np_dtype_to_tensor_dtype(dtype), + dims=weight.shape, + vals=q_weight.tobytes(), + raw=True, + ) + model.add_initializer(q_weight_tensor) + node.input[1] = q_weight_tensor.name + if init_share_num == 1: + model.remove_initializer(weight_tensor) + + model.add_nodes(new_nodes) + model.remove_nodes(remove_nodes) + model.topological_sort() + return model + + +def get_weight_scale(weight, group_size): + """Get the scale of weight.""" + org_shape = weight.shape + weight = np.reshape(weight, (-1, group_size)) if group_size != -1 else weight + scale = np.mean(np.reshape(np.abs(weight) / np.max(np.abs(weight), axis=1, keepdims=True), org_shape), axis=0) + return scale + + +def prepare_inputs(model, n_samples, dataloader, providers): + """Prepare inputs for weight only quantization. + + Args: + model (ModelProto or ONNXModel): onnx model + n_samples (int, optional): calibration sample number. -1 means all samples. + dataloader (object): dataloader for calibration. + providers (list): providers to use + + Returns: + inputs: prepared inputs. 
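For illustration only (not part of the diff): the new k-quant path is reached by calling rtn_quantize with algorithm="k_quant", which replaces eligible MatMul initializers with packed MatMulNBits weights. A minimal usage sketch with a hypothetical model path:

import onnx

model = onnx.load("model_fp32.onnx")  # hypothetical input model
quantized = rtn_quantize(model, num_bits=4, group_size=32, algorithm="k_quant")
onnx.save(quantized.model, "model_int4_kquant.onnx")  # the returned ONNXModel wraps the ModelProto in .model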
+ so: session options + """ + from importlib.util import find_spec + + from .util import to_numpy + + so = ort.SessionOptions() + if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover + from onnxruntime_extensions import get_library_path + + so.register_custom_ops_library(get_library_path()) + if model.is_large_model: + onnx.save_model( + model.model, + model.model_path + "_augment.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + convert_attribute=False, + ) + + session = ( + ort.InferenceSession(model.model.SerializeToString(), so, providers=providers) + if not model.is_large_model + else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) + ) + inputs_names = [i.name for i in session.get_inputs()] + del session + + inputs = [] + for i, data in enumerate(dataloader): + if n_samples != -1 and ((i + 1) * dataloader.batch_size) > n_samples: + break + if len(inputs_names) != 1 or isinstance(data[0], dict): + assert len(data[0]) == len(inputs_names), ( + f"Input number mismatch, require {len(inputs_names)} but get {len(data[0])}" + ) + + if isinstance(data[0], dict): + inputs.append(dict([(name, to_numpy(inp_data)) for name, inp_data in data[0].items()])) # noqa: C404 + elif isinstance(data[0], np.ndarray): # pragma: no cover + inputs.append(dict([(name, inp) for name, inp in zip(inputs_names, [data[0]], strict=False)])) # noqa: C404 + else: # pragma: no cover + inputs.append(dict([(name, to_numpy(inp)) for name, inp in zip(inputs_names, data[0], strict=False)])) # noqa: C404 + return inputs, so + + +def gptq( + W, + H, + num_bits=4, + group_size=32, + scheme="asym", + blocksize=128, + percdamp=0.01, + actorder=False, + mse=False, + perchannel=True, +): + """Quant the weight with GPTQ method. + + Args: + W (array): weight. + H (array): Hessian matrix. + num_bits (int, optional): num_bits. Default is 4. + group_size (int, optional): how many elements share one scale/zp. Default is 32. + scheme (str, optional): sym or asym. Defaults to "asym". + blocksize (int, optional): blocksize to quantize weight. + percdamp (float, optional): percent of the average Hessian diagonal to use for dampening. + actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value. + mse (bool, optional): whether get scale and zero point with mse error. + perchannel (bool, optional): whether quantize weight per-channel. 
+ + Returns: + Q: fake quantized weight + """ + maxq = 2**num_bits - 1 + grid = 100 + maxshrink = 0.8 + norm = 2.4 + + def find_params(weight): + org_shape = weight.shape + # find zp, scale + if not perchannel: + weight = np.expand_dims(weight.flatten(), axis=1) + tmp = np.zeros(weight.shape[1]) + xmin = np.minimum(np.min(weight, axis=0), tmp) + xmax = np.maximum(np.max(weight, axis=0), tmp) + if scheme == "sym": + xmax = np.maximum(np.abs(xmin), xmax) + tmp = xmin < 0 + if np.any(tmp): + xmin[tmp] = -xmax[tmp] + tmp = (xmin == 0) & (xmax == 0) + xmin[tmp] = -1 + xmax[tmp] = +1 + + scale = (xmax - xmin) / maxq + if scheme == "sym": + zero = np.ones(scale.shape) * (maxq + 1) / 2 + else: + zero = np.round(-xmin / scale) + if mse: + best = np.ones([weight.shape[1]]) * float("inf") + for i in range(int(maxshrink * grid)): + p = 1 - i / grid + xmin1 = p * xmin + xmax1 = p * xmax + scale1 = (xmax1 - xmin1) / maxq + zero1 = np.round(-xmin1 / scale1) if scheme != "sym" else zero + q = np.clip(np.round(weight / scale1) + zero1, 0, maxq) + q -= weight + q = np.power(np.abs(q), norm) + err = np.sum(q, 0) + tmp = err < best + if np.any(tmp): + best[tmp] = err[tmp] + scale[tmp] = scale1[tmp] + zero[tmp] = zero1[tmp] + if not perchannel: + tmp = org_shape[1] + scale = np.repeat(scale, tmp) + zero = np.repeat(zero, tmp) + shape = [-1] + [1] * (len(org_shape) - 1) + scale = np.reshape(scale, shape) + zero = np.reshape(zero, shape) + return scale, zero + + shape = W.shape + scale, zp = find_params(W) + dead = np.diag(H) == 0 + H[dead, dead] = 1 + W[dead, :] = 0 # such channel makes no contribution to quantization computation + + # rearrange considering the diag's value + if actorder: + perm = np.argsort(np.diag(H))[::-1] + W = W[perm, :] # noqa: N806 + H = H[perm, :][:, perm] # noqa: N806 + Losses = np.zeros_like(W) # noqa: N806 + Q = np.zeros_like(W) # noqa: N806 + damp = percdamp * np.mean(np.diag(H)) + diag = np.arange(shape[0]) + H[diag, diag] += damp # add a average value of + H = np.linalg.cholesky(np.linalg.inv(H)).T # noqa: N806 + Hinv = H # noqa: N806 + for i1 in range(0, shape[0], blocksize): + i2 = min(i1 + blocksize, shape[0]) + count = i2 - i1 + + W1 = copy.deepcopy(W[i1:i2, :]) # noqa: N806 + Q1 = np.zeros_like(W1) # noqa: N806 + Err1 = np.zeros_like(W1) # noqa: N806 + Losses1 = np.zeros_like(W1) # noqa: N806 + Hinv1 = Hinv[i1:i2, i1:i2] # noqa: N806 + + for i in range(count): # within a block, channel wise + w = W1[i, :] + d = Hinv1[i, i] + + if group_size != -1: + if (i1 + i) % group_size == 0: + scale, zp = find_params(W[(i1 + i) : (i1 + i + group_size), :]) + + q = (scale * (np.clip(np.round(w[:, np.newaxis] / scale) + zp, 0, maxq) - zp)).flatten() + Q1[i, :] = q + Losses1[i, :] = (w - q) ** 2 / d**2 + + err1 = (w - q) / d + W1[i:, :] -= np.matmul(np.expand_dims(Hinv1[i:, i], axis=1), np.expand_dims(err1, axis=0)) + Err1[i, :] = err1 + + Q[i1:i2, :] = Q1 + Losses[i1:i2, :] = Losses1 / 2 + + W[i2:, :] -= np.matmul(Hinv[i2:, i1:i2], Err1) + + if actorder: + invperm = np.argsort(perm) + Q = Q[invperm, :] # noqa: N806 + + Q = np.reshape(Q, W.shape) # noqa: N806 + del W + return Q + + +def gptq_quantize( + model, + dataloader, + weight_config={}, # noqa: B006 + num_bits=4, + group_size=32, + scheme="asym", + n_samples=128, + percdamp=0.01, + blocksize=128, + actorder=False, + mse=False, + perchannel=True, + accuracy_level=0, + providers=["CPUExecutionProvider"], # noqa: B006 +): + """Quant the model with GPTQ method. 
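For illustration only (not part of the diff): gptq_quantize accumulates a per-MatMul Hessian from calibration activations as a running average of 2 * x^T x, which is then handed to gptq() above. The accumulation can be sketched with toy data as:

import numpy as np

cols = 16
batches = [np.random.randn(8, cols) for _ in range(4)]  # toy stand-in for calibration activations

H = np.zeros((cols, cols))
nsamples = 0
for x in batches:
    x = np.reshape(x, (-1, x.shape[-1]))
    n = x.shape[0]
    H *= nsamples / (nsamples + n)   # rescale the running average before adding the new batch
    nsamples += n
    x = np.sqrt(2 / nsamples) * x
    H += x.T @ x                     # H now approximates (2 / nsamples) * sum_i x_i^T x_i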
+ + Args: + model (ModelProto or ONNXModel): onnx model + dataloader (object): dataloader for calibration. + weight_config (dict): quantization config + For example, + weight_config = { + 'fc2': + { + 'bits': 4, + 'group_size': 32, + 'scheme': 'sym', + 'algorithm': 'GPTQ' + } + } + num_bits (int, optional): num_bits. Default is 4. + group_size (int, optional): how many elements share one scale/zp. Default is 32. + scheme (str, optional): sym or asym. Defaults to "asym". + n_samples (int, optional): calibration sample number. + percdamp (float, optional): percent of the average Hessian diagonal to use for dampening. + blocksize (int, optional): blocksize to quantize weight. + actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value. + mse (bool, optional): whether get scale and zero point with mse error. + perchannel (bool, optional): whether quantize weight per-channel. + accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8). + providers (list): providers to use + + Returns: + model: fake quantized ONNXModel + """ + model = ONNXModel(model) + base_dir = os.path.dirname(model.model_path) if model.model_path is not None else "" + + inputs, so = prepare_inputs(model, n_samples, dataloader, providers) + del dataloader + org_output = copy.deepcopy(model.model.graph.output) + model.remove_tensors_from_outputs([i.name for i in org_output]) + output_names = [] + for node in model.nodes(): + if ( + node.op_type in ["MatMul"] + and weight_config.get(node.name, {}) != "fp32" + and weight_config.get(node.name, {}).get("algorithm", "GPTQ") == "GPTQ" + ): + output_names.append(node.input[0]) + output_names = list(set(output_names)) + model.add_tensors_to_outputs(output_names) + if model.is_large_model: + onnx.save_model( + model.model, + model.model_path + "_augment.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + convert_attribute=False, + ) + + session = ( + ort.InferenceSession(model.model.SerializeToString(), so, providers=providers) + if not model.is_large_model + else ort.InferenceSession(model.model_path + "_augment.onnx", so, providers=providers) + ) + + for idx, input_name in enumerate(output_names): + simple_progress_bar(len(output_names), idx + 1) + node_list = [] + weights = [] + + for node in model.input_name_to_nodes[input_name]: + if ( + node.op_type in ["MatMul"] + and weight_config.get(node.name, {}) != "fp32" + and weight_config.get(node.name, {}).get("algorithm", "GPTQ") == "GPTQ" + and model.get_initializer(node.input[1]) is not None + ): + weight = numpy_helper.to_array( + model.get_initializer(model.get_node(node.name).input[1]), base_dir + ).copy() + if len(weight.shape) != 2: + continue + + weights.append(weight) + node_list.append(model.get_node(node.name)) + + if len(weights) == 0: + continue + + Hs = [np.zeros((i.shape[0], i.shape[0])) for i in weights] # noqa: N806 + nsamples = 0 + for data in inputs: + inp = session.run([input_name], data)[0] + tmp = inp.shape[0] + inp = np.reshape(inp, (-1, inp.shape[-1])) + Hs = [i * (nsamples / (nsamples + tmp)) for i in Hs] # noqa: N806 + nsamples += tmp + inp = np.sqrt(2 / nsamples) * inp + Hs = [i + np.matmul(inp.T, inp) for i in Hs] # noqa: N806 + + for ( + node, + weight, + H, # noqa: N806 + ) in zip(node_list, weights, Hs, strict=False): + if node.name in weight_config: + num_bits = weight_config[node.name]["bits"] + group_size = weight_config[node.name]["group_size"] + scheme = weight_config[node.name]["scheme"] + group_size = group_size 
if group_size != -1 else weight.shape[0] + dtype = weight.dtype + + q_weight = gptq( + weight, + H, + num_bits=num_bits, + group_size=group_size, + scheme=scheme, + blocksize=blocksize, + percdamp=percdamp, + actorder=actorder, + mse=mse, + perchannel=perchannel, + ) + + weight_tensor = model.get_initializer(node.input[1]) + init_share_num = model.get_initializer_share_num(node.input[1]) + + satisfy_MatMulNBits_condition = num_bits == 4 # noqa: N806 + + if satisfy_MatMulNBits_condition: # pragma: no cover + org_shape = weight.shape + k_blocks = (org_shape[0] + group_size - 1) // group_size + q_weight = pad_tensor(q_weight, group_size, k_blocks) + q_weight, scale, zp = quant_tensor(q_weight.T, num_bits, group_size, scheme, "uint") + q_matmul_node, new_inits = make_matmul_weight_only_node( + node=node, + weight_shape=org_shape, + num_bits=num_bits, + group_size=group_size, + k_blocks=k_blocks, + q_weight=q_weight.astype("uint8"), + scale=scale.astype(dtype), + zero_point=zp if scheme == "asym" else None, + accuracy_level=accuracy_level, + ) + + model.add_initializers(new_inits) + model.remove_node(node) + model.add_node(q_matmul_node) + else: + q_weight_tensor = onnx.helper.make_tensor( + name=node.input[1] + f"_Q{num_bits!s}G{group_size!s}", + data_type=np_dtype_to_tensor_dtype(dtype), + dims=q_weight.shape, + vals=q_weight.astype(dtype).tobytes(), + raw=True, + ) + model.add_initializer(q_weight_tensor) + node.input[1] = q_weight_tensor.name + if init_share_num == 1: + model.remove_initializer(weight_tensor) + + model.remove_tensors_from_outputs(output_names) + model.model.graph.output.MergeFrom(org_output) + + model.topological_sort() + + # reload external data to prevent external data file path errors + if model.is_large_model: + from onnx.external_data_helper import load_external_data_for_model + + load_external_data_for_model(model.model, os.path.split(model.model_path)[0]) + + return model diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 0e739055b1772..03f4791c580e6 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -360,6 +360,8 @@ def test_quantize_matmul_int4_offsets_qdq(self): def test_quantize_matmul_int4_using_rtn_algo(self): if not find_spec("neural_compressor"): self.skipTest("skip test_smooth_quant since neural_compressor is not installed") + if not find_spec("torch"): + self.skipTest("skip test_quantize_matmul_int4_using_rtn_algo since torch is not installed") model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) self.construct_model_matmul(model_fp32_path, symmetric=False) data_reader = self.input_feeds(1, {"input": (100, 52)}) @@ -371,6 +373,8 @@ def test_quantize_matmul_int4_using_rtn_algo(self): def test_quantize_matmul_int4_using_gptq_algo(self): if not find_spec("neural_compressor"): self.skipTest("skip test_smooth_quant since neural_compressor is not installed") + if not find_spec("torch"): + self.skipTest("skip test_quantize_matmul_int4_using_gptq_algo since torch is not installed") model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) self.construct_model_matmul(model_fp32_path, symmetric=False) data_reader = self.input_feeds(1, {"input": (100, 52)}) diff --git a/setup.py b/setup.py index 1e426ea8e060b..c45657c0c2873 100644 --- a/setup.py +++ b/setup.py @@ -517,6 +517,7 @@ def 
finalize_options(self): "onnxruntime.quantization.CalTableFlatBuffers", "onnxruntime.quantization.fusions", "onnxruntime.quantization.execution_providers.qnn", + "onnxruntime.quantization.neural_compressor", "onnxruntime.transformers", "onnxruntime.transformers.models.bart", "onnxruntime.transformers.models.bert", From b4f7a905b0d636b71bd486c0ef702eb5a44eadf2 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 3 May 2025 16:48:51 +1000 Subject: [PATCH 02/84] Selection policy. Device discovery updates. Bug fixes. (#24625) ### Description Add initial selection policy implementations. Update device discovery - get vendor and vendor id for CPU from cpuid_info - trim metadata to known useful fields - NPU detection via dxcore only Bug fixes/updates from PRs for C# and python bindings Add some tests for selection policy - TODO: Add more tests ### Motivation and Context Desire to boil oceans. --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../core/session/onnxruntime_c_api.h | 44 ++- .../core/session/onnxruntime_cxx_api.h | 12 +- .../core/session/onnxruntime_cxx_inline.h | 7 + onnxruntime/core/common/cpuid_info.cc | 10 + onnxruntime/core/common/cpuid_info.h | 7 + .../core/framework/graph_partitioner.cc | 8 +- onnxruntime/core/framework/session_options.h | 15 + .../core/platform/windows/device_discovery.cc | 85 +++-- .../core/session/abi_key_value_pairs.h | 2 + .../core/session/abi_session_options.cc | 11 + .../core/session/ep_library_internal.cc | 8 +- onnxruntime/core/session/inference_session.h | 4 +- .../core/session/model_compilation_options.cc | 4 +- onnxruntime/core/session/onnxruntime_c_api.cc | 3 +- onnxruntime/core/session/ort_apis.h | 4 + .../core/session/provider_policy_context.cc | 347 ++++++++++++++++++ .../core/session/provider_policy_context.h | 79 ++++ onnxruntime/core/session/utils.cc | 17 +- .../test/autoep/test_autoep_selection.cc | 122 +++++- 19 files changed, 724 insertions(+), 65 deletions(-) create mode 100644 onnxruntime/core/session/provider_policy_context.cc create mode 100644 onnxruntime/core/session/provider_policy_context.h diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 9a5891f9e236d..cef5eab9a505e 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -425,6 +425,32 @@ typedef enum OrtExecutionProviderDevicePolicy { OrtExecutionProviderDevicePolicy_MIN_OVERALL_POWER, } OrtExecutionProviderDevicePolicy; +/** \brief Delegate to allow providing custom OrtEpDevice selection logic + * + * This delegate is called by the EP selection code to allow the user to provide custom device selection logic. + * The user can use this to select OrtEpDevice instances from the list of available devices. + * + * \param ep_devices The list of available devices. + * \param num_devices The number of available devices. + * \param model_metadata The model metadata. + * \param runtime_metadata The runtime metadata. May be nullptr. + * \param selected Pre-allocated array to populate with selected OrtEpDevice pointers from ep_devices. + * \param max_ep_devices The maximum number of devices that can be selected in the pre-allocated array. + Currently the maximum is 8. + * \param num_ep_devices The number of selected devices. + * + * \return OrtStatus* Selection status. Return nullptr on success. + * Use CreateStatus to provide error info. Use ORT_FAIL as the error code. 
+ * ORT will release the OrtStatus* if not null. + */ +typedef OrtStatus* (*EpSelectionDelegate)(_In_ const OrtEpDevice** ep_devices, + _In_ size_t num_devices, + _In_ const OrtKeyValuePairs* model_metadata, + _In_opt_ const OrtKeyValuePairs* runtime_metadata, + _Inout_ const OrtEpDevice** selected, + _In_ size_t max_selected, + _Out_ size_t* num_selected); + /** \brief Algorithm to use for cuDNN Convolution Op */ typedef enum OrtCudnnConvAlgoSearch { @@ -5073,7 +5099,8 @@ struct OrtApi { ORT_API2_STATUS(GetEpDevices, _In_ const OrtEnv* env, _Outptr_ const OrtEpDevice* const** ep_devices, _Out_ size_t* num_ep_devices); - /** \brief Append execution provider to the session options by name. + /** \brief Append the execution provider that is responsible for the selected OrtEpDevice instances + * to the session options. * * \param[in] session_options Session options to add execution provider to. * \param[in] env Environment that execution providers were registered with. @@ -5098,6 +5125,21 @@ struct OrtApi { _In_reads_(num_op_options) const char* const* ep_option_vals, size_t num_ep_options); + /** \brief Set the execution provider selection policy for the session. + * + * Allows users to specify a device selection policy for automatic execution provider (EP) selection, + * or provide a delegate callback for custom selection logic. + * + * \param[in] session_options The OrtSessionOptions instance. + * \param[in] policy The device selection policy to use (see OrtExecutionProviderDevicePolicy). + * \param[in] delegate Optional delegate callback for custom selection. Pass nullptr to use the built-in policy. + * + * \since Version 1.22 + */ + ORT_API2_STATUS(SessionOptionsSetEpSelectionPolicy, _In_ OrtSessionOptions* session_options, + _In_ OrtExecutionProviderDevicePolicy policy, + _In_opt_ EpSelectionDelegate* delegate); + /** \brief Get the hardware device type. * * \param[in] device The OrtHardwareDevice instance to query. diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 0ecc27c59dc28..6c175c606b4a1 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -1085,19 +1085,27 @@ struct SessionOptionsImpl : ConstSessionOptionsImpl { SessionOptionsImpl& AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT SessionOptionsImpl& AppendExecutionProvider_TensorRT_V2(const OrtTensorRTProviderOptionsV2& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT SessionOptionsImpl& AppendExecutionProvider_MIGraphX(const OrtMIGraphXProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_MIGraphX - ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CANN + /// Wraps OrtApi::SessionOptionsAppendExecutionProvider_CANN SessionOptionsImpl& AppendExecutionProvider_CANN(const OrtCANNProviderOptions& provider_options); - ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_Dnnl + /// Wraps OrtApi::SessionOptionsAppendExecutionProvider_Dnnl SessionOptionsImpl& AppendExecutionProvider_Dnnl(const OrtDnnlProviderOptions& provider_options); /// Wraps OrtApi::SessionOptionsAppendExecutionProvider. Currently supports QNN, SNPE and XNNPACK. 
SessionOptionsImpl& AppendExecutionProvider(const std::string& provider_name, const std::unordered_map& provider_options = {}); + /// Append EPs that have been registered previously with the OrtEnv. + /// Wraps OrtApi::SessionOptionsAppendExecutionProvider_V2 SessionOptionsImpl& AppendExecutionProvider_V2(Env& env, const std::vector& ep_devices, const KeyValuePairs& ep_options); + /// Append EPs that have been registered previously with the OrtEnv. + /// Wraps OrtApi::SessionOptionsAppendExecutionProvider_V2 SessionOptionsImpl& AppendExecutionProvider_V2(Env& env, const std::vector& ep_devices, const std::unordered_map& ep_options); + /// Wraps OrtApi::SessionOptionsSetEpSelectionPolicy + SessionOptionsImpl& SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy policy, + EpSelectionDelegate* delegate = nullptr); + SessionOptionsImpl& SetCustomCreateThreadFn(OrtCustomCreateThreadFn ort_custom_create_thread_fn); ///< Wraps OrtApi::SessionOptionsSetCustomCreateThreadFn SessionOptionsImpl& SetCustomThreadCreationOptions(void* ort_custom_thread_creation_options); ///< Wraps OrtApi::SessionOptionsSetCustomThreadCreationOptions SessionOptionsImpl& SetCustomJoinThreadFn(OrtCustomJoinThreadFn ort_custom_join_thread_fn); ///< Wraps OrtApi::SessionOptionsSetCustomJoinThreadFn diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 48b3b80cced55..1fdb8f16d9600 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1149,6 +1149,13 @@ inline SessionOptionsImpl& SessionOptionsImpl::AppendExecutionProvider_V2( return *this; } +template +inline SessionOptionsImpl& SessionOptionsImpl::SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy policy, + EpSelectionDelegate* delegate) { + ThrowOnError(GetApi().SessionOptionsSetEpSelectionPolicy(this->p_, policy, delegate)); + return *this; +} + template inline SessionOptionsImpl& SessionOptionsImpl::SetCustomCreateThreadFn(OrtCustomCreateThreadFn ort_custom_create_thread_fn) { ThrowOnError(GetApi().SessionOptionsSetCustomCreateThreadFn(this->p_, ort_custom_create_thread_fn)); diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index 97766028cfe12..ee7782e3c8763 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -106,6 +106,7 @@ void CPUIDInfo::X86Init() { GetCPUID(0, data); vendor_ = GetX86Vendor(data); + vendor_id_ = GetVendorId(vendor_); int num_IDs = data[0]; if (num_IDs >= 1) { @@ -151,6 +152,14 @@ std::string CPUIDInfo::GetX86Vendor(int32_t* data) { #endif // defined(CPUIDINFO_ARCH_X86) +uint32_t CPUIDInfo::GetVendorId(const std::string& vendor) { + if (vendor == "GenuineIntel") return 0x8086; + if (vendor == "GenuineAMD") return 0x1022; + if (vendor.find("Qualcomm") == 0) return 'Q' << 24 | 'C' << 16 | 'O' << 8 | 'M'; + if (vendor.find("NV") == 0) return 0x10DE; + return 0; +} + #if defined(CPUIDINFO_ARCH_ARM) #if defined(__linux__) @@ -204,6 +213,7 @@ void CPUIDInfo::ArmLinuxInit() { void CPUIDInfo::ArmWindowsInit() { // Get the ARM vendor string from the registry vendor_ = GetArmWindowsVendor(); + vendor_id_ = GetVendorId(vendor_); // Read MIDR and ID_AA64ISAR1_EL1 register values from Windows registry // There should be one per CPU diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index 4d6e7e8b9105e..b820fa2ab1af7 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ 
b/onnxruntime/core/common/cpuid_info.h @@ -19,6 +19,10 @@ class CPUIDInfo { return vendor_; } + uint32_t GetCPUVendorId() const { + return vendor_id_; + } + bool HasAMX_BF16() const { return has_amx_bf16_; } bool HasAVX() const { return has_avx_; } bool HasAVX2() const { return has_avx2_; } @@ -123,6 +127,9 @@ class CPUIDInfo { bool has_arm_neon_bf16_{false}; std::string vendor_; + uint32_t vendor_id_; + + uint32_t GetVendorId(const std::string& vendor); #if defined(CPUIDINFO_ARCH_X86) diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index e3e54be3f7c21..8ed5eeaa8d44f 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -806,7 +806,13 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers ORT_RETURN_IF(ep_context_gen_options.error_if_no_compiled_nodes, "Compiled model does not contain any EPContext nodes. " "Check that the session EPs support compilation and can execute at least one model subgraph."); - return Status::OK(); + + LOGS(logger, WARNING) << "Compiled model does not contain any EPContext nodes. " + "Either the session EPs do not support compilation or " + "no subgraphs were able to be compiled."; + + // we continue on to generate the compiled model which may benefit from L1 optimizations even if there are not + // EPContext nodes. } auto get_ep_context_node = [&all_ep_context_nodes](const std::string& node_name) -> std::pair { diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 94ff2bb55a055..8f8a3d6634a7e 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -90,6 +90,15 @@ struct EpContextModelGenerationOptions { size_t output_external_initializer_size_threshold = 0; }; +struct EpSelectionPolicy { + // flag to detect that a policy was set by the user. + // need to preserve current behavior of defaulting to CPU EP if no EPs are explicitly registered + // and no selection policy was explicitly provided. + bool enable{false}; + OrtExecutionProviderDevicePolicy policy = OrtExecutionProviderDevicePolicy_DEFAULT; + EpSelectionDelegate* delegate{}; +}; + /** * Configuration information for a session. */ @@ -222,6 +231,11 @@ struct SessionOptions { // copied internally and the flag needs to be accessible across all copies. std::shared_ptr load_cancellation_flag = std::make_shared(false); + // Policy to guide Execution Provider selection + EpSelectionPolicy ep_selection_policy = {false, + OrtExecutionProviderDevicePolicy::OrtExecutionProviderDevicePolicy_DEFAULT, + nullptr}; + // Options for generating compile EPContext models were previously stored in session_option.configs as // string key/value pairs. To support more advanced options, such as setting input/output buffers, we // now have to store EPContext options in a struct of type EpContextModelGenerationOptions. 
@@ -253,6 +267,7 @@ inline std::ostream& operator<<(std::ostream& os, const SessionOptions& session_ << " use_per_session_threads:" << session_options.use_per_session_threads << " thread_pool_allow_spinning:" << session_options.thread_pool_allow_spinning << " use_deterministic_compute:" << session_options.use_deterministic_compute + << " ep_selection_policy:" << session_options.ep_selection_policy.policy << " config_options: { " << session_options.config_options << " }" //<< " initializers_to_share_map:" << session_options.initializers_to_share_map #if !defined(ORT_MINIMAL_BUILD) && !defined(DISABLE_EXTERNAL_INITIALIZERS) diff --git a/onnxruntime/core/platform/windows/device_discovery.cc b/onnxruntime/core/platform/windows/device_discovery.cc index 88fbec37c8075..5a5b5041a5912 100644 --- a/onnxruntime/core/platform/windows/device_discovery.cc +++ b/onnxruntime/core/platform/windows/device_discovery.cc @@ -119,7 +119,13 @@ std::unordered_map GetDeviceInfoSetupApi(const std::unorde uint32_t vendor_id = get_id(buffer, L"VEN_"); uint32_t device_id = get_id(buffer, L"DEV_"); - // won't always have a vendor id from an ACPI entry. need at least a device id to identify the hardware + + // Processor ID should come from CPUID mapping. + if (vendor_id == 0 && guid == GUID_DEVCLASS_PROCESSOR) { + vendor_id = CPUIDInfo::GetCPUIDInfo().GetCPUVendorId(); + } + + // Won't always have a vendor id from an ACPI entry. ACPI is not defined for this purpose. if (vendor_id == 0 && device_id == 0) { continue; } @@ -138,8 +144,8 @@ std::unordered_map GetDeviceInfoSetupApi(const std::unorde entry = &device_info[key]; entry->vendor_id = vendor_id; entry->device_id = device_id; - // put the first hardware id string in the metadata. ignore the other lines. - entry->metadata.emplace(L"SPDRP_HARDWAREID", std::wstring(buffer, wcslen(buffer))); + // put the first hardware id string in the metadata. ignore the other lines. not sure if this is of value. + // entry->metadata.emplace(L"SPDRP_HARDWAREID", std::wstring(buffer, wcslen(buffer))); } else { // need valid ids continue; @@ -156,14 +162,14 @@ std::unordered_map GetDeviceInfoSetupApi(const std::unorde (PBYTE)buffer, sizeof(buffer), &size)) { std::wstring desc{buffer}; - // Should we require the NPU to be found by DXCore or do we want to allow this vague matching? - // Probably depends on whether we always attempt to run DXCore or not. - const auto possible_npu = [](const std::wstring& desc) { - return (desc.find(L"NPU") != std::wstring::npos || - desc.find(L"Neural") != std::wstring::npos || - desc.find(L"AI Engine") != std::wstring::npos || - desc.find(L"VPU") != std::wstring::npos); - }; + // For now, require dxcore to identify an NPU. + // If we want to try and infer it from the description this _may_ work but is untested. 
+ // const auto possible_npu = [](const std::wstring& desc) { + // return (desc.find(L"NPU") != std::wstring::npos || + // desc.find(L"Neural") != std::wstring::npos || + // desc.find(L"AI Engine") != std::wstring::npos || + // desc.find(L"VPU") != std::wstring::npos); + // }; // use description if no friendly name if (entry->description.empty()) { @@ -171,7 +177,7 @@ std::unordered_map GetDeviceInfoSetupApi(const std::unorde } uint64_t npu_key = GetDeviceKey(*entry); - bool is_npu = npus.count(npu_key) > 0 || possible_npu(desc); + bool is_npu = npus.count(npu_key) > 0; // rely on dxcore to determine if something is an NPU if (guid == GUID_DEVCLASS_DISPLAY) { entry->type = OrtHardwareDeviceType_GPU; @@ -194,22 +200,17 @@ std::unordered_map GetDeviceInfoSetupApi(const std::unorde continue; } - if (SetupDiGetDeviceRegistryPropertyW(devInfo, &devData, SPDRP_MFG, nullptr, - (PBYTE)buffer, sizeof(buffer), &size)) { - entry->vendor = std::wstring(buffer, wcslen(buffer)); + if (entry->type == OrtHardwareDeviceType_CPU) { + // get 12 byte string from CPUID. easier for a user to match this if they are explicitly picking a device. + std::string_view cpuid_vendor = CPUIDInfo::GetCPUIDInfo().GetCPUVendor(); + entry->vendor = std::wstring(cpuid_vendor.begin(), cpuid_vendor.end()); } - // Add the UI number if GPU. Helpful if user has integrated and discrete GPUs - if (entry->type == OrtHardwareDeviceType_GPU) { - DWORD ui_number = 0; - if (SetupDiGetDeviceRegistryPropertyW(devInfo, &devData, SPDRP_UI_NUMBER, nullptr, - (PBYTE)&ui_number, sizeof(ui_number), &size)) { - // use value read. - } else { - // infer it as 0 if not set. + if (entry->vendor.empty()) { + if (SetupDiGetDeviceRegistryPropertyW(devInfo, &devData, SPDRP_MFG, nullptr, + (PBYTE)buffer, sizeof(buffer), &size)) { + entry->vendor = std::wstring(buffer, wcslen(buffer)); } - - entry->metadata.emplace(L"SPDRP_UI_NUMBER", std::to_wstring(ui_number)); } } @@ -252,9 +253,7 @@ std::unordered_map GetDeviceInfoD3D12() { info.description = std::wstring(desc.Description); info.metadata[L"DxgiAdapterNumber"] = std::to_wstring(i); - info.metadata[L"VideoMemory"] = std::to_wstring(desc.DedicatedVideoMemory / (1024 * 1024)) + L" MB"; - info.metadata[L"SystemMemory"] = std::to_wstring(desc.DedicatedSystemMemory / (1024 * 1024)) + L" MB"; - info.metadata[L"SharedSystemMemory"] = std::to_wstring(desc.DedicatedSystemMemory / (1024 * 1024)) + L" MB"; + info.metadata[L"DxgiVideoMemory"] = std::to_wstring(desc.DedicatedVideoMemory / (1024 * 1024)) + L" MB"; } // iterate by high-performance GPU preference to add that info @@ -272,7 +271,7 @@ std::unordered_map GetDeviceInfoD3D12() { auto it = device_info.find(key); if (it != device_info.end()) { DeviceInfo& info = it->second; - info.metadata[L"HighPerformanceIndex"] = std::to_wstring(i); + info.metadata[L"DxgiHighPerformanceIndex"] = std::to_wstring(i); } } @@ -405,25 +404,40 @@ std::unordered_set DeviceDiscovery::DiscoverDevicesForPlatfor } } - std::wstring_convert > converter; // wstring to string - const auto device_to_ortdevice = [&converter]( + // our log output to std::wclog breaks with UTF8 chars that are not supported by the current code page. + // e.g. (TM) symbol. that stops ALL logging working on at least arm64. + // safest way to avoid that is to keep it to single byte chars. + // process the OrtHardwareDevice values this way so it can be safely logged. + // only the 'description' metadata is likely to be affected and that is mainly for diagnostic purposes. 
+ const auto to_safe_string = [](const std::wstring& wstr) -> std::string { + std::string str(wstr.size(), ' '); + std::transform(wstr.begin(), wstr.end(), str.begin(), [](wchar_t wchar) { + if (wchar >= 0 && wchar <= 127) { + return static_cast(wchar); + } + return ' '; + }); + return str; + }; + + const auto device_to_ortdevice = [&to_safe_string]( DeviceInfo& device, std::unordered_map* extra_metadata = nullptr) { - OrtHardwareDevice ortdevice{device.type, device.vendor_id, device.device_id, converter.to_bytes(device.vendor)}; + OrtHardwareDevice ortdevice{device.type, device.vendor_id, device.device_id, to_safe_string(device.vendor)}; if (!device.description.empty()) { - ortdevice.metadata.Add("Description", converter.to_bytes(device.description)); + ortdevice.metadata.Add("Description", to_safe_string(device.description)); } for (auto& [key, value] : device.metadata) { - ortdevice.metadata.Add(converter.to_bytes(key), converter.to_bytes(value)); + ortdevice.metadata.Add(to_safe_string(key), to_safe_string(value)); } if (extra_metadata) { // add any extra metadata from the dxcore info for (auto& [key, value] : *extra_metadata) { if (device.metadata.find(key) == device.metadata.end()) { - ortdevice.metadata.Add(converter.to_bytes(key), converter.to_bytes(value)); + ortdevice.metadata.Add(to_safe_string(key), to_safe_string(value)); } } } @@ -431,6 +445,7 @@ std::unordered_set DeviceDiscovery::DiscoverDevicesForPlatfor std::ostringstream oss; oss << "Adding OrtHardwareDevice {vendor_id:0x" << std::hex << ortdevice.vendor_id << ", device_id:0x" << ortdevice.device_id + << ", vendor:" << ortdevice.vendor << ", type:" << std::dec << static_cast(ortdevice.type) << ", metadata: ["; for (auto& [key, value] : ortdevice.metadata.entries) { diff --git a/onnxruntime/core/session/abi_key_value_pairs.h b/onnxruntime/core/session/abi_key_value_pairs.h index 3242be817881a..150575b3a9efc 100644 --- a/onnxruntime/core/session/abi_key_value_pairs.h +++ b/onnxruntime/core/session/abi_key_value_pairs.h @@ -57,6 +57,8 @@ struct OrtKeyValuePairs { keys.erase(key_iter); values.erase(values.begin() + idx); } + + entries.erase(iter); } } diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc index 0b116c2fa64f6..b1c0467da642e 100644 --- a/onnxruntime/core/session/abi_session_options.cc +++ b/onnxruntime/core/session/abi_session_options.cc @@ -366,6 +366,17 @@ ORT_API_STATUS_IMPL(OrtApis::SetDeterministicCompute, _Inout_ OrtSessionOptions* API_IMPL_END } +ORT_API_STATUS_IMPL(OrtApis::SessionOptionsSetEpSelectionPolicy, _In_ OrtSessionOptions* options, + _In_ OrtExecutionProviderDevicePolicy policy, + _In_opt_ EpSelectionDelegate* delegate) { + API_IMPL_BEGIN + options->value.ep_selection_policy.enable = true; + options->value.ep_selection_policy.policy = policy; + options->value.ep_selection_policy.delegate = delegate; + return nullptr; + API_IMPL_END +} + ORT_API_STATUS_IMPL(OrtApis::SessionOptionsSetLoadCancellationFlag, _Inout_ OrtSessionOptions* options, _In_ bool is_cancel) { API_IMPL_BEGIN diff --git a/onnxruntime/core/session/ep_library_internal.cc b/onnxruntime/core/session/ep_library_internal.cc index c515195c7e6bf..aa032f24f13c0 100644 --- a/onnxruntime/core/session/ep_library_internal.cc +++ b/onnxruntime/core/session/ep_library_internal.cc @@ -183,14 +183,14 @@ std::vector> EpLibraryInternal::CreateInterna // CPU EP internal_eps.push_back(CreateCpuEp()); -#if defined(USE_DML) - internal_eps.push_back(CreateDmlEp()); -#endif - #if 
defined(USE_WEBGPU) internal_eps.push_back(CreateWebGpuEp()); #endif +#if defined(USE_DML) + internal_eps.push_back(CreateDmlEp()); +#endif + return internal_eps; } diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index a21388d1e9918..ba9812a59fec3 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -350,8 +350,8 @@ class InferenceSession { /** * Initializes a previously loaded ONNX model. Initialization includes but is not - * limited to graph transformations, construction of kernels, etc. - * This method assumes that a method has been loaded previously. + * limited to graph transformations, construction of kernels, EP policy decisions, etc. + * This method assumes that a model has been loaded previously. * This API is thread-safe. * @return OK if success */ diff --git a/onnxruntime/core/session/model_compilation_options.cc b/onnxruntime/core/session/model_compilation_options.cc index 80ef18de5cfa3..c4a7c5262d03d 100644 --- a/onnxruntime/core/session/model_compilation_options.cc +++ b/onnxruntime/core/session/model_compilation_options.cc @@ -19,7 +19,9 @@ ModelCompilationOptions::ModelCompilationOptions(const OrtEnv& env, const OrtSes session_options_.value.ep_context_gen_options = session_options.value.GetEpContextGenerationOptions(); session_options_.value.ep_context_gen_options.enable = true; session_options_.value.ep_context_gen_options.overwrite_existing_output_file = true; - session_options_.value.ep_context_gen_options.error_if_no_compiled_nodes = true; + // defaulting to false to support wider usage. will log WARNING if compiling model with no context nodes. + // TODO: Add ability for user to explicitly set this. + session_options_.value.ep_context_gen_options.error_if_no_compiled_nodes = false; // Shouldn't fail because the key/value strings are below the maximum string length limits in ConfigOptions. 
ORT_ENFORCE(session_options_.value.config_options.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1").IsOK()); diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index b5c271594055a..c70075b234faf 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -3012,6 +3012,7 @@ static constexpr OrtApi ort_api_1_to_23 = { &OrtApis::UnregisterExecutionProviderLibrary, &OrtApis::GetEpDevices, &OrtApis::SessionOptionsAppendExecutionProvider_V2, + &OrtApis::SessionOptionsSetEpSelectionPolicy, &OrtApis::HardwareDevice_Type, &OrtApis::HardwareDevice_VendorId, @@ -3061,7 +3062,7 @@ static_assert(offsetof(OrtApi, AddExternalInitializersFromFilesInMemory) / sizeo // no additions in version 19, 20, and 21 static_assert(offsetof(OrtApi, SetEpDynamicOptions) / sizeof(void*) == 284, "Size of version 20 API cannot change"); -static_assert(offsetof(OrtApi, GetEpApi) / sizeof(void*) == 315, "Size of version 22 API cannot change"); +static_assert(offsetof(OrtApi, GetEpApi) / sizeof(void*) == 316, "Size of version 22 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: static_assert(std::string_view(ORT_VERSION) == "1.23.0", diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index 0033eb0d604f2..7928f9b822cf0 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -575,6 +575,10 @@ ORT_API_STATUS_IMPL(SessionOptionsAppendExecutionProvider_V2, _In_ OrtSessionOpt _In_reads_(num_op_options) const char* const* ep_option_vals, size_t num_ep_options); +ORT_API_STATUS_IMPL(SessionOptionsSetEpSelectionPolicy, _In_ OrtSessionOptions* sess_options, + _In_ OrtExecutionProviderDevicePolicy policy, + _In_opt_ EpSelectionDelegate* delegate); + // OrtHardwareDevice accessors. ORT_API(OrtHardwareDeviceType, HardwareDevice_Type, _In_ const OrtHardwareDevice* device); ORT_API(uint32_t, HardwareDevice_VendorId, _In_ const OrtHardwareDevice* device); diff --git a/onnxruntime/core/session/provider_policy_context.cc b/onnxruntime/core/session/provider_policy_context.cc new file mode 100644 index 0000000000000..565891fe2cdfb --- /dev/null +++ b/onnxruntime/core/session/provider_policy_context.cc @@ -0,0 +1,347 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include "core/session/provider_policy_context.h" + +#include + +#include "core/framework/error_code_helper.h" +#include "core/session/abi_devices.h" +#include "core/session/ep_factory_internal.h" +#include "core/session/inference_session.h" +#include "core/session/inference_session_utils.h" +#include "core/session/onnxruntime_c_api.h" +#include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/session/ort_apis.h" + +namespace onnxruntime { +namespace { +bool MatchesEpVendor(const OrtEpDevice* d) { + // TODO: Would be better to match on Id. Should the EP add that in EP metadata? 
+ return d->device->vendor == d->ep_vendor; +} + +bool IsDiscreteDevice(const OrtEpDevice* d) { + if (d->device->type != OrtHardwareDeviceType::OrtHardwareDeviceType_GPU) { + return false; + } + + const auto& entries = d->device->metadata.entries; + if (auto it = entries.find("Discrete"); it != entries.end()) { + return it->second == "1"; + } + + return false; +} + +bool IsDefaultCpuEp(const OrtEpDevice* d) { + return d->device->type == OrtHardwareDeviceType::OrtHardwareDeviceType_CPU && + d->ep_vendor == "Microsoft"; +} + +// Sort devices. NPU -> GPU -> CPU +// Within in type, vendor owned, not. +// Default CPU EP is last +std::vector OrderDevices(const std::vector& devices) { + std::vector sorted_devices(devices.begin(), devices.end()); + std::sort(sorted_devices.begin(), sorted_devices.end(), [](const OrtEpDevice* a, const OrtEpDevice* b) { + auto aDeviceType = a->device->type; + auto bDeviceType = b->device->type; + if (aDeviceType != bDeviceType) { + // NPU -> GPU -> CPU + // std::sort is ascending order, so NPU < GPU < CPU + + // one is an NPU + if (aDeviceType == OrtHardwareDeviceType::OrtHardwareDeviceType_NPU) { + return true; + } else if (bDeviceType == OrtHardwareDeviceType::OrtHardwareDeviceType_NPU) { + return false; + } + + if (aDeviceType == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU) { + return true; + } else if (bDeviceType == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU) { + return false; + } + + // this shouldn't be reachable as it would imply both are CPU + ORT_THROW("Unexpected combination of devices"); + } + + // both devices are the same + + if (aDeviceType == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU) { + bool aDiscrete = IsDiscreteDevice(a); + bool bDiscrete = IsDiscreteDevice(b); + if (aDiscrete != bDiscrete) { + return aDiscrete == true; // prefer discrete + } + + // both discrete or both integrated + } + + // prefer device matching platform vendor + bool aVendor = MatchesEpVendor(a); + bool bVendor = MatchesEpVendor(b); + if (aVendor != bVendor) { + return aVendor == true; // prefer the device that matches the EP vendor + } + + // default CPU EP last + bool aIsDefaultCpuEp = IsDefaultCpuEp(a); + bool bIsDefaultCpuEp = IsDefaultCpuEp(b); + if (!aIsDefaultCpuEp && !bIsDefaultCpuEp) { + // neither are default CPU EP. both do/don't match vendor. + // TODO: implement tie-breaker for this scenario. arbitrarily prefer the first for now + return true; + } + + // one is the default CPU EP + return aIsDefaultCpuEp == false; // prefer the one that is not the default CPU EP + }); + + return sorted_devices; +} +} // namespace + +// Select execution providers based on the device policy and available devices and add to session +Status ProviderPolicyContext::SelectEpsForSession(const Environment& env, const OrtSessionOptions& options, + InferenceSession& sess) { + ORT_ENFORCE(options.value.ep_selection_policy.delegate == nullptr, + "EP selection delegate support is not implemented yet."); + + // Get the list of devices from the environment and order them. + // Ordered by preference within each type. NPU -> GPU -> NPU + // TODO: Should environment.cc do the ordering? 
+ const auto& execution_devices = OrderDevices(env.GetOrtEpDevices()); + + // The list of devices selected by policies + std::vector devices_selected; + + // Run the delegate if it was passed in lieu of any other policy + if (options.value.ep_selection_policy.delegate) { + auto policy_fn = options.value.ep_selection_policy.delegate; + std::vector delegate_devices(execution_devices.begin(), execution_devices.end()); + std::array selected_devices{nullptr}; + + size_t num_selected = 0; + auto* status = (*policy_fn)(delegate_devices.data(), delegate_devices.size(), + nullptr, nullptr, selected_devices.data(), selected_devices.size(), &num_selected); + + // return or fall-through for both these cases + // going with explicit failure for now so it's obvious to user what is happening + if (status != nullptr) { + std::string delegate_error_msg = OrtApis::GetErrorMessage(status); // copy + OrtApis::ReleaseStatus(status); + + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "EP selection delegate failed: ", delegate_error_msg); + } + + if (num_selected == 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "EP selection delegate did not select anything."); + } + } else { + // Create the selector for the chosen policy + std::unique_ptr selector; + switch (options.value.ep_selection_policy.policy) { + case OrtExecutionProviderDevicePolicy_DEFAULT: + selector = std::make_unique(); + break; + case OrtExecutionProviderDevicePolicy_PREFER_CPU: + selector = std::make_unique(); + break; + case OrtExecutionProviderDevicePolicy_PREFER_NPU: + case OrtExecutionProviderDevicePolicy_MAX_EFFICIENCY: + case OrtExecutionProviderDevicePolicy_MIN_OVERALL_POWER: + selector = std::make_unique(); + break; + case OrtExecutionProviderDevicePolicy_PREFER_GPU: + case OrtExecutionProviderDevicePolicy_MAX_PERFORMANCE: + selector = std::make_unique(); + break; + } + + // Execute policy + + selector->SelectProvidersForDevices(execution_devices, devices_selected); + } + + // Fail if we did not find any device matches + if (devices_selected.empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "No execution providers selected. Please check the device policy and available devices."); + } + + // Configure the session options for the devices. This updates the SessionOptions in the InferenceSession with any + // EP options that have not been overridden by the user. + ORT_RETURN_IF_ERROR(AddEpDefaultOptionsToSession(sess, devices_selected)); + + // Create OrtSessionOptions for the CreateEp call. + // Once the InferenceSession is created, its SessionOptions is the source of truth and contains all the values from + // the user provided OrtSessionOptions. We do a copy for simplicity. The OrtSessionOptions instance goes away + // once we exit this function so an EP implementation should not use OrtSessionOptions after it returns from + // CreateEp. 
+ auto& session_options = sess.GetMutableSessionOptions(); + OrtSessionOptions ort_so; + ort_so.value = session_options; + const auto& session_logger = sess.GetLogger(); + const OrtLogger& api_session_logger = *session_logger->ToExternal(); + + // Remove the ORT CPU EP if configured to do so + bool disable_ort_cpu_ep = ort_so.value.config_options.GetConfigEntry(kOrtSessionOptionsDisableCPUEPFallback) == "1"; + if (disable_ort_cpu_ep) { + RemoveOrtCpuDevice(devices_selected); + } + + // Fold the EPs into a single structure per factory + std::vector eps_selected; + FoldSelectedDevices(devices_selected, eps_selected); + + // Iterate through the selected EPs and create them + for (size_t idx = 0; idx < eps_selected.size(); ++idx) { + std::unique_ptr ep = nullptr; + ORT_RETURN_IF_ERROR(CreateExecutionProvider(env, ort_so, api_session_logger, eps_selected[idx], ep)); + if (ep != nullptr) { + ORT_RETURN_IF_ERROR(sess.RegisterExecutionProvider(std::move(ep))); + } + } + + return Status::OK(); +} + +void ProviderPolicyContext::FoldSelectedDevices(std::vector devices_selected, + std::vector& eps_selected) { + while (devices_selected.size() > 0) { + const auto ep_name = std::string(devices_selected[0]->ep_name); + SelectionInfo info; + info.ep_factory = devices_selected[0]->ep_factory; + + do { + auto iter = std::find_if(devices_selected.begin(), devices_selected.end(), [&ep_name](const OrtEpDevice* d) { + return d->ep_name == ep_name; + }); + + if (iter != devices_selected.end()) { + info.devices.push_back((*iter)->device); + info.ep_metadata.push_back(&(*iter)->ep_metadata); + devices_selected.erase(iter); + } else { + break; + } + } while (true); + + eps_selected.push_back(info); + } +} + +Status ProviderPolicyContext::CreateExecutionProvider(const Environment& env, OrtSessionOptions& options, + const OrtLogger& logger, + SelectionInfo& info, std::unique_ptr& ep) { + EpFactoryInternal* internal_factory = env.GetEpFactoryInternal(info.ep_factory); + + if (internal_factory) { + // this is a factory we created and registered internally for internal and provider bridge EPs + OrtStatus* status = internal_factory->CreateIExecutionProvider(info.devices.data(), info.ep_metadata.data(), + info.devices.size(), &options, &logger, + &ep); + if (status != nullptr) { + return ToStatus(status); + } + } else { + // in the real setup we need an IExecutionProvider wrapper implementation that uses the OrtEp internally, + // and we would add that IExecutionProvider to the InferenceSession. + // but first we need OrtEp and the OrtEpApi to be implemented. + ORT_NOT_IMPLEMENTED("IExecutionProvider that wraps OrtEp has not been implemented."); + + // OrtEp* api_ep = nullptr; + //// add the ep_options to session options but leave any existing entries (user provided overrides) untouched. 
+ // auto status = info.ep_factory->CreateEp(info.ep_factory, info.devices.data(), info.ep_metadata.data(), + // info.devices.size(), &options, &logger, + // &api_ep); + // if (status != nullptr) { + // return ToStatus(status); + // } + } + + return Status::OK(); +} + +Status ProviderPolicyContext::AddEpDefaultOptionsToSession(InferenceSession& sess, + std::vector devices) { + auto& config_options = sess.GetMutableSessionOptions().config_options; + for (auto device : devices) { + const std::string ep_options_prefix = OrtSessionOptions::GetProviderOptionPrefix(device->ep_name.c_str()); + for (const auto& [key, value] : device->ep_options.entries) { + const std::string option_key = ep_options_prefix + key; + // preserve user-provided options as they override any defaults the EP factory specified earlier + if (config_options.configurations.find(option_key) == config_options.configurations.end()) { + // use AddConfigEntry for the error checking it does + ORT_RETURN_IF_ERROR(config_options.AddConfigEntry(option_key.c_str(), value.c_str())); + } + } + } + + return Status::OK(); +} + +void ProviderPolicyContext::RemoveOrtCpuDevice(std::vector& devices) { + // Remove the Microsoft CPU EP. always last if available. + if (IsDefaultCpuEp(devices.back())) { + devices.pop_back(); + } +} + +void DefaultEpPolicy::SelectProvidersForDevices(const std::vector& sorted_devices, + std::vector& selected) { + // Default policy is prefer CPU + PreferCpuEpPolicy().SelectProvidersForDevices(sorted_devices, selected); +} + +void PreferCpuEpPolicy::SelectProvidersForDevices(const std::vector& sorted_devices, + std::vector& selected) { + // Select the first CPU device from sorted devices + auto first_cpu = std::find_if(sorted_devices.begin(), sorted_devices.end(), + [](const OrtEpDevice* device) { + return device->device->type == OrtHardwareDeviceType::OrtHardwareDeviceType_CPU; + }); + + ORT_ENFORCE(first_cpu != sorted_devices.end(), "No CPU based execution providers were found."); + selected.push_back(*first_cpu); + + // add ORT CPU EP as the final option to ensure maximum coverage of opsets and operators + if (!IsDefaultCpuEp(*first_cpu) && IsDefaultCpuEp(sorted_devices.back())) { + selected.push_back(sorted_devices.back()); + } +} + +void PreferNpuEpPolicy::SelectProvidersForDevices(const std::vector& sorted_devices, + std::vector& selected) { + // Select the first NPU if there is one. 
+ if (sorted_devices.front()->device->type == OrtHardwareDeviceType::OrtHardwareDeviceType_NPU) { + selected.push_back(sorted_devices.front()); + } + + // CPU fallback + PreferCpuEpPolicy().SelectProvidersForDevices(sorted_devices, selected); +} + +void PreferGpuEpPolicy::SelectProvidersForDevices(const std::vector& sorted_devices, + std::vector& selected) { + // Select the first GPU device + auto first_gpu = std::find_if(sorted_devices.begin(), sorted_devices.end(), + [](const OrtEpDevice* device) { + return device->device->type == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU; + }); + + if (first_gpu != sorted_devices.end()) { + selected.push_back(*first_gpu); + } + + // Add a CPU fallback + PreferCpuEpPolicy().SelectProvidersForDevices(sorted_devices, selected); +} +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/core/session/provider_policy_context.h b/onnxruntime/core/session/provider_policy_context.h new file mode 100644 index 0000000000000..185f9523312ba --- /dev/null +++ b/onnxruntime/core/session/provider_policy_context.h @@ -0,0 +1,79 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#if !defined(ORT_MINIMAL_BUILD) + +#include "core/session/abi_session_options_impl.h" +#include "core/session/environment.h" +#include "core/session/onnxruntime_c_api.h" // For OrtExecutionProviderDevicePolicy + +namespace onnxruntime { + +struct SelectionInfo { + OrtEpFactory* ep_factory; + std::vector devices; + std::vector ep_metadata; +}; + +class IEpPolicySelector { + public: + /// + /// Select the OrtEpDevice instances to use. + /// Selection is in priority order. Highest priority first. + /// + /// Ordered devices. + /// Type order is NPU -> GPU -> CPU + /// Within a type: Discrete -> Integrated if GPU, EP vendor matches device vendor, vendor does not match + /// ORT CPU EP is always last if available. 
+ /// + /// + virtual void SelectProvidersForDevices(const std::vector& sorted_devices, + std::vector& selected_devices) = 0; + + virtual ~IEpPolicySelector() = default; +}; + +class ProviderPolicyContext { + public: + ProviderPolicyContext() = default; + + Status SelectEpsForSession(const Environment& env, const OrtSessionOptions& options, InferenceSession& sess); + Status AddEpDefaultOptionsToSession(InferenceSession& sess, std::vector devices); + void RemoveOrtCpuDevice(std::vector& devices); + Status CreateExecutionProvider(const Environment& env, OrtSessionOptions& options, const OrtLogger& logger, + SelectionInfo& info, std::unique_ptr& ep); + void FoldSelectedDevices(std::vector devices_selected, // copy + std::vector& eps_selected); + + private: +}; + +class DefaultEpPolicy : public IEpPolicySelector { + public: + void SelectProvidersForDevices(const std::vector& sorted_devices, + std::vector& selected_devices) override; +}; + +class PreferCpuEpPolicy : public IEpPolicySelector { + public: + void SelectProvidersForDevices(const std::vector& sorted_devices, + std::vector& selected_devices) override; +}; + +class PreferNpuEpPolicy : public IEpPolicySelector { + public: + void SelectProvidersForDevices(const std::vector& sorted_devices, + std::vector& selected_devices) override; +}; + +class PreferGpuEpPolicy : public IEpPolicySelector { + public: + void SelectProvidersForDevices(const std::vector& sorted_devices, + std::vector& selected_devices) override; +}; + +} // namespace onnxruntime + +#endif // !ORT_MINIMAL_BUILD diff --git a/onnxruntime/core/session/utils.cc b/onnxruntime/core/session/utils.cc index adb019fdde86d..d17514e54a945 100644 --- a/onnxruntime/core/session/utils.cc +++ b/onnxruntime/core/session/utils.cc @@ -18,6 +18,7 @@ #include "core/session/ort_apis.h" #include "core/session/ort_env.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/session/provider_policy_context.h" using namespace onnxruntime; #if !defined(ORT_MINIMAL_BUILD) @@ -71,6 +72,11 @@ Status TestAutoSelectEPsImpl(const Environment& env, InferenceSession& sess, con return ToStatus(status); } } else { + // in the real setup we need an IExecutionProvider wrapper implementation that uses the OrtEp internally, + // and we would add that IExecutionProvider to the InferenceSession. + ORT_NOT_IMPLEMENTED("IExecutionProvider that wraps OrtEp has not been implemented."); + + /* OrtEp* api_ep = nullptr; auto status = ep_device->ep_factory->CreateEp( ep_device->ep_factory, devices.data(), ep_metadata.data(), devices.size(), @@ -79,10 +85,7 @@ Status TestAutoSelectEPsImpl(const Environment& env, InferenceSession& sess, con if (status != nullptr) { return ToStatus(status); } - - // in the real setup we need an IExecutionProvider wrapper implementation that uses the OrtEp internally, - // and we would add that IExecutionProvider to the InferenceSession. 
- ORT_NOT_IMPLEMENTED("IExecutionProvider that wraps OrtEp has not been implemented."); + */ } ORT_RETURN_IF_ERROR(sess.RegisterExecutionProvider(std::move(ep))); @@ -175,6 +178,12 @@ OrtStatus* CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, if (auto_select_ep_name) { ORT_API_RETURN_IF_STATUS_NOT_OK(TestAutoSelectEPsImpl(env->GetEnvironment(), *sess, *auto_select_ep_name)); } + + // if there are no providers registered, and there's an ep selection policy set, do auto ep selection + if (options != nullptr && options->provider_factories.empty() && options->value.ep_selection_policy.enable) { + ProviderPolicyContext context; + ORT_API_RETURN_IF_STATUS_NOT_OK(context.SelectEpsForSession(env->GetEnvironment(), *options, *sess)); + } #endif // !defined(ORT_MINIMAL_BUILD) #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) diff --git a/onnxruntime/test/autoep/test_autoep_selection.cc b/onnxruntime/test/autoep/test_autoep_selection.cc index 619f0a4bcda33..04b1b2ea0bdc4 100644 --- a/onnxruntime/test/autoep/test_autoep_selection.cc +++ b/onnxruntime/test/autoep/test_autoep_selection.cc @@ -64,7 +64,10 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod const std::vector& expected_dims_y, const std::vector& expected_values_y, bool auto_select = true, // auto select vs SessionOptionsAppendExecutionProvider_V2 + // manual select using functor const std::function&)>& select_devices = nullptr, + // auto select using policy + std::optional policy = std::nullopt, bool test_session_creation_only = false) { Ort::SessionOptions session_options; @@ -74,16 +77,20 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod } if (auto_select) { - // manually specify EP to select for now - session_options.AddConfigEntry("test.ep_to_select", ep_to_select.c_str()); - - // add the provider options to the session options with the required prefix - const std::string option_prefix = OrtSessionOptions::GetProviderOptionPrefix(ep_to_select.c_str()); - std::vector keys, values; - ep_options.GetKeyValuePairs(keys, values); - for (size_t i = 0, end = keys.size(); i < end; ++i) { - // add the default value with prefix - session_options.AddConfigEntry((option_prefix + keys[i]).c_str(), values[i]); + if (policy) { + session_options.SetEpSelectionPolicy(*policy); + } else { + // manually specify EP to select + session_options.AddConfigEntry("test.ep_to_select", ep_to_select.c_str()); + + // add the provider options to the session options with the required prefix + const std::string option_prefix = OrtSessionOptions::GetProviderOptionPrefix(ep_to_select.c_str()); + std::vector keys, values; + ep_options.GetKeyValuePairs(keys, values); + for (size_t i = 0, end = keys.size(); i < end; ++i) { + // add the default value with prefix + session_options.AddConfigEntry((option_prefix + keys[i]).c_str(), values[i]); + } } } else { std::vector devices; @@ -188,7 +195,7 @@ TEST(AutoEpSelection, DmlEP) { devices.push_back(ep_device); } else { // if this is available, 0 == best performance - auto* perf_index = c_api->GetKeyValue(kvps, "HighPerformanceIndex"); + auto* perf_index = c_api->GetKeyValue(kvps, "DxgiHighPerformanceIndex"); if (perf_index && strcmp(perf_index, "0") == 0) { devices[0] = ep_device; // replace as this is the higher performance device } @@ -213,20 +220,27 @@ TEST(AutoEpSelection, WebGpuEP) { TEST(AutoEpSelection, MiscApiTests) { const OrtApi* c_api = &Ort::GetApi(); - // nullptr and empty input to OrtKeyValuePairs + // nullptr and empty input to 
OrtKeyValuePairs. also test RemoveKeyValuePair { OrtKeyValuePairs* kvps = nullptr; c_api->CreateKeyValuePairs(&kvps); c_api->AddKeyValuePair(kvps, "key1", nullptr); // should be ignored c_api->AddKeyValuePair(kvps, nullptr, "value1"); // should be ignored c_api->RemoveKeyValuePair(kvps, nullptr); // should be ignored - - c_api->AddKeyValuePair(kvps, "", "value2"); // empty key should be ignored + c_api->AddKeyValuePair(kvps, "", "value2"); // should be ignored ASSERT_EQ(c_api->GetKeyValue(kvps, ""), nullptr); + c_api->AddKeyValuePair(kvps, "key1", "value1"); c_api->AddKeyValuePair(kvps, "key2", ""); // empty value is allowed ASSERT_EQ(c_api->GetKeyValue(kvps, "key2"), std::string("")); + c_api->RemoveKeyValuePair(kvps, "key1"); + const char* const* keys = nullptr; + const char* const* values = nullptr; + size_t num_entries = 0; + c_api->GetKeyValuePairs(kvps, &keys, &values, &num_entries); + ASSERT_EQ(num_entries, 1); + c_api->ReleaseKeyValuePairs(kvps); } @@ -259,6 +273,86 @@ TEST(AutoEpSelection, MiscApiTests) { } } +TEST(AutoEpSelection, PreferCpu) { + std::vector> inputs(1); + auto& input = inputs.back(); + input.name = "X"; + input.dims = {3, 2}; + input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + // prepare expected inputs and outputs + std::vector expected_dims_y = {3, 2}; + std::vector expected_values_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f}; + + const Ort::KeyValuePairs provider_options; + + TestInference(*ort_env, ORT_TSTR("testdata/mul_1.onnx"), + "", // don't need EP name + std::nullopt, + provider_options, + inputs, + "Y", + expected_dims_y, + expected_values_y, + /* auto_select */ true, + /*select_devices*/ nullptr, + OrtExecutionProviderDevicePolicy::OrtExecutionProviderDevicePolicy_PREFER_CPU); +} + +// this should fallback to CPU if no GPU +TEST(AutoEpSelection, PreferGpu) { + std::vector> inputs(1); + auto& input = inputs.back(); + input.name = "X"; + input.dims = {3, 2}; + input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + // prepare expected inputs and outputs + std::vector expected_dims_y = {3, 2}; + std::vector expected_values_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f}; + + const Ort::KeyValuePairs provider_options; + + TestInference(*ort_env, ORT_TSTR("testdata/mul_1.onnx"), + "", // don't need EP name + std::nullopt, + provider_options, + inputs, + "Y", + expected_dims_y, + expected_values_y, + /* auto_select */ true, + /*select_devices*/ nullptr, + OrtExecutionProviderDevicePolicy::OrtExecutionProviderDevicePolicy_PREFER_GPU); +} + +// this should fallback to CPU if no NPU +TEST(AutoEpSelection, PreferNpu) { + std::vector> inputs(1); + auto& input = inputs.back(); + input.name = "X"; + input.dims = {3, 2}; + input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + // prepare expected inputs and outputs + std::vector expected_dims_y = {3, 2}; + std::vector expected_values_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f}; + + const Ort::KeyValuePairs provider_options; + + TestInference(*ort_env, ORT_TSTR("testdata/mul_1.onnx"), + "", // don't need EP name + std::nullopt, + provider_options, + inputs, + "Y", + expected_dims_y, + expected_values_y, + /* auto_select */ true, + /*select_devices*/ nullptr, + OrtExecutionProviderDevicePolicy::OrtExecutionProviderDevicePolicy_PREFER_NPU); +} + namespace { struct ExamplePluginInfo { const std::filesystem::path library_path = From c51d67b400e162b28c85c1dfe648635cb38714a3 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 3 May 2025 22:56:53 +1000 Subject: [PATCH 03/84] C# API updates for auto ep selection and 
compile API (#24604) ### Description C# API updates for auto ep selection and the compilation API. Also includes bugfix to OrtKeyValuePairs::Remove. ### Motivation and Context --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../CompileModel.shared.cs | 130 ++++++++ .../NativeCompileApiMethods.shared.cs | 152 ++++++++++ .../NativeMethods.shared.cs | 277 +++++++++++++++++- .../Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs | 63 ++++ .../OrtEpDevice.shared.cs | 98 +++++++ .../OrtHardwareDevice.shared.cs | 116 ++++++++ .../OrtKeyValuePairs.shared.cs | 192 ++++++++++++ .../OrtValue.shared.cs | 4 +- .../SessionOptions.shared.cs | 107 ++++++- .../CompileApiTests.cs | 67 +++++ .../OrtAutoEpTests.cs | 159 ++++++++++ .../OrtKeyValuePairsTests.cs | 39 +++ ...oft.ML.OnnxRuntime.Tests.NetCoreApp.csproj | 3 +- onnxruntime/core/session/ort_apis.h | 5 +- 14 files changed, 1402 insertions(+), 10 deletions(-) create mode 100644 csharp/src/Microsoft.ML.OnnxRuntime/CompileModel.shared.cs create mode 100644 csharp/src/Microsoft.ML.OnnxRuntime/NativeCompileApiMethods.shared.cs create mode 100644 csharp/src/Microsoft.ML.OnnxRuntime/OrtEpDevice.shared.cs create mode 100644 csharp/src/Microsoft.ML.OnnxRuntime/OrtHardwareDevice.shared.cs create mode 100644 csharp/src/Microsoft.ML.OnnxRuntime/OrtKeyValuePairs.shared.cs create mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/CompileApiTests.cs create mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs create mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtKeyValuePairsTests.cs diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/CompileModel.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/CompileModel.shared.cs new file mode 100644 index 0000000000000..9f42bf2247529 --- /dev/null +++ b/csharp/src/Microsoft.ML.OnnxRuntime/CompileModel.shared.cs @@ -0,0 +1,130 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +namespace Microsoft.ML.OnnxRuntime +{ + using System; + using System.Runtime.InteropServices; + + /// + /// This class is used to set options for model compilation, and to produce a compiled model using those options. + /// See https://onnxruntime.ai/docs/api/c/ for further details of various options. + /// + public class OrtModelCompilationOptions : SafeHandle + { + /// + /// Create a new OrtModelCompilationOptions object from SessionOptions. + /// + /// SessionOptions instance to read settings from. + public OrtModelCompilationOptions(SessionOptions sessionOptions) + : base(IntPtr.Zero, true) + { + NativeApiStatus.VerifySuccess( + NativeMethods.CompileApi.OrtCreateModelCompilationOptionsFromSessionOptions( + OrtEnv.Instance().Handle, sessionOptions.Handle, out handle)); + } + + /// + /// Compile the model using the options set in this object. + /// + public void CompileModel() + { + NativeApiStatus.VerifySuccess(NativeMethods.CompileApi.OrtCompileModel(OrtEnv.Instance().Handle, handle)); + } + + + /// + /// Set the input model to compile. + /// + /// Path to ONNX model to compile. + public void SetInputModelPath(string path) + { + var platformPath = NativeOnnxValueHelper.GetPlatformSerializedString(path); + NativeApiStatus.VerifySuccess( + NativeMethods.CompileApi.OrtModelCompilationOptions_SetInputModelPath(handle, platformPath)); + } + + /// + /// Set the input model to compile to be a byte array. + /// The input bytes are NOT copied and must remain valid while in use by ORT. 
+ /// + /// Input model bytes. + public void SetInputModelFromBuffer(byte[] buffer) + { + NativeApiStatus.VerifySuccess( + NativeMethods.CompileApi.OrtModelCompilationOptions_SetInputModelFromBuffer( + handle, buffer, (UIntPtr)buffer.Length)); + } + + /// + /// Set the path to write the compiled ONNX model to. + /// + /// Path to write compiled model to. + public void SetOutputModelPath(string path) + { + var platformPath = NativeOnnxValueHelper.GetPlatformSerializedString(path); + NativeApiStatus.VerifySuccess( + NativeMethods.CompileApi.OrtModelCompilationOptions_SetOutputModelPath(handle, platformPath)); + + } + + /// + /// Set the path to a file to write initializers as external data to, + /// and the threshold that determines when to write an initializer to the external data file. + /// + /// Path to file to write external data to. + /// Size at which an initializer will be written to external data. + public void SetOutputModelExternalInitializersFile(string filePath, ulong threshold) + { + var platformPath = NativeOnnxValueHelper.GetPlatformSerializedString(filePath); + NativeApiStatus.VerifySuccess( + NativeMethods.CompileApi.OrtModelCompilationOptions_SetOutputModelExternalInitializersFile( + handle, platformPath, new UIntPtr(threshold))); + } + + // TODO: In order to use this to create an InferenceSession without copying bytes we need more infrastructure. + // - Need something that wraps the allocator, pointer and size and is SafeHandle based. + // - When it is disposed we need to use the allocator to release the native buffer. + // - Need the 4 InferenceSession ctors that take byte[] for the model to be duplicated to handle this new + // wrapper type. + // Due to that making this API internal so we can test it. We can make it public when the other infrastructure + // is in place as it will change the signature of the API. + internal void SetOutputModelBuffer(OrtAllocator allocator, + ref IntPtr outputModelBufferPtr, ref UIntPtr outputModelBufferSizePtr) + { + NativeApiStatus.VerifySuccess( + NativeMethods.CompileApi.OrtModelCompilationOptions_SetOutputModelBuffer( + handle, allocator.Pointer, ref outputModelBufferPtr, ref outputModelBufferSizePtr)); + } + + /// + /// Enables or disables the embedding of EPContext binary data into the `ep_cache_context` attribute + /// of EPContext nodes. + /// + /// Enable if true. Default is false. + public void SetEpContextEmbedMode(bool embed) + { + NativeApiStatus.VerifySuccess( + NativeMethods.CompileApi.OrtModelCompilationOptions_SetEpContextEmbedMode(handle, embed)); + } + + internal IntPtr Handle => handle; + + + /// + /// Indicates whether the native handle is invalid. + /// + public override bool IsInvalid => handle == IntPtr.Zero; + + /// + /// Release the native instance of OrtModelCompilationOptions. + /// + /// true + protected override bool ReleaseHandle() + { + NativeMethods.CompileApi.OrtReleaseModelCompilationOptions(handle); + handle = IntPtr.Zero; + return true; + } + } +} \ No newline at end of file diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeCompileApiMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeCompileApiMethods.shared.cs new file mode 100644 index 0000000000000..3a87f87d124e9 --- /dev/null +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeCompileApiMethods.shared.cs @@ -0,0 +1,152 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
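Before moving on to the marshaling layer, here is a minimal sketch of how the OrtModelCompilationOptions class above is intended to be used. The file names are placeholders, and any EP-specific configuration is assumed to already have been applied to the SessionOptions instance.

using Microsoft.ML.OnnxRuntime;

// Minimal compile flow sketch: read settings from SessionOptions, point at an input model,
// choose an output path, embed the EP context binary data, and compile.
using var sessionOptions = new SessionOptions();

using var compileOptions = new OrtModelCompilationOptions(sessionOptions);
compileOptions.SetInputModelPath("model.onnx");        // placeholder input path
compileOptions.SetOutputModelPath("model_ctx.onnx");   // placeholder output path
compileOptions.SetEpContextEmbedMode(true);
compileOptions.CompileModel();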
+ +namespace Microsoft.ML.OnnxRuntime.CompileApi; + +using System; +using System.Runtime.InteropServices; + +// NOTE: The order of the APIs in this struct should match exactly that in OrtCompileApi +// See onnxruntime/core/session/compile_api.cc. +[StructLayout(LayoutKind.Sequential)] +public struct OrtCompileApi +{ + public IntPtr ReleaseModelCompilationOptions; + public IntPtr CreateModelCompilationOptionsFromSessionOptions; + public IntPtr ModelCompilationOptions_SetInputModelPath; + public IntPtr ModelCompilationOptions_SetInputModelFromBuffer; + public IntPtr ModelCompilationOptions_SetOutputModelPath; + public IntPtr ModelCompilationOptions_SetOutputModelExternalInitializersFile; + public IntPtr ModelCompilationOptions_SetOutputModelBuffer; + public IntPtr ModelCompilationOptions_SetEpContextEmbedMode; + public IntPtr CompileModel; +} + +internal class NativeMethods +{ + private static OrtCompileApi _compileApi; + + // + // Define the delegate signatures, and a static member for each to hold the marshaled function pointer. + // + // We populate the static members in the constructor of this class. + // + // The C# code will call the C++ API through the delegate instances in the static members. + // + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate void DOrtReleaseModelCompilationOptions(IntPtr /* OrtModelCompilationOptions* */ options); + public DOrtReleaseModelCompilationOptions OrtReleaseModelCompilationOptions; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtCreateModelCompilationOptionsFromSessionOptions( + IntPtr /* const OrtEnv* */ env, + IntPtr /* const OrtSessionOptions* */ sessionOptions, + out IntPtr /* OrtModelCompilationOptions** */ outOptions); + public DOrtCreateModelCompilationOptionsFromSessionOptions + OrtCreateModelCompilationOptionsFromSessionOptions; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtModelCompilationOptions_SetInputModelPath( + IntPtr /* OrtModelCompilationOptions* */ options, + byte[] /* const ORTCHAR_T* */ inputModelPath); + public DOrtModelCompilationOptions_SetInputModelPath OrtModelCompilationOptions_SetInputModelPath; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtModelCompilationOptions_SetInputModelFromBuffer( + IntPtr /* OrtModelCompilationOptions* */ options, + byte[] /* const void* */ inputModelData, + UIntPtr /* size_t */ inputModelDataSize); + public DOrtModelCompilationOptions_SetInputModelFromBuffer + OrtModelCompilationOptions_SetInputModelFromBuffer; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtModelCompilationOptions_SetOutputModelPath( + IntPtr /* OrtModelCompilationOptions* */ options, + byte[] /* const ORTCHAR_T* */ outputModelPath); + public DOrtModelCompilationOptions_SetOutputModelPath OrtModelCompilationOptions_SetOutputModelPath; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtModelCompilationOptions_SetOutputModelExternalInitializersFile( + IntPtr /* OrtModelCompilationOptions* */ options, + byte[] /* const ORTCHAR_T* */ externalInitializersFilePath, + UIntPtr /* size_t */ externalInitializerSizeThreshold); + public DOrtModelCompilationOptions_SetOutputModelExternalInitializersFile + OrtModelCompilationOptions_SetOutputModelExternalInitializersFile; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate 
IntPtr /* OrtStatus* */ DOrtModelCompilationOptions_SetOutputModelBuffer( + IntPtr /* OrtModelCompilationOptions* */ options, + IntPtr /* OrtAllocator* */ allocator, + ref IntPtr /* void** */ outputModelBufferPtr, + ref UIntPtr /* size_t* */ outputModelBufferSizePtr); + public DOrtModelCompilationOptions_SetOutputModelBuffer OrtModelCompilationOptions_SetOutputModelBuffer; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtModelCompilationOptions_SetEpContextEmbedMode( + IntPtr /* OrtModelCompilationOptions* */ options, + bool embedEpContextInModel); + public DOrtModelCompilationOptions_SetEpContextEmbedMode OrtModelCompilationOptions_SetEpContextEmbedMode; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtCompileModel( + IntPtr /* const OrtEnv* */ env, + IntPtr /* const OrtModelCompilationOptions* */ modelOptions); + public DOrtCompileModel OrtCompileModel; + + internal NativeMethods(OnnxRuntime.NativeMethods.DOrtGetCompileApi getCompileApi) + { + +#if NETSTANDARD2_0 + IntPtr compileApiPtr = getCompileApi(); + _compileApi = (OrtCompileApi)Marshal.PtrToStructure(compileApiPtr, typeof(OrtCompileApi)); +#else + _compileApi = (OrtCompileApi)getCompileApi(); +#endif + + OrtReleaseModelCompilationOptions = + (DOrtReleaseModelCompilationOptions)Marshal.GetDelegateForFunctionPointer( + _compileApi.ReleaseModelCompilationOptions, + typeof(DOrtReleaseModelCompilationOptions)); + + OrtCreateModelCompilationOptionsFromSessionOptions = + (DOrtCreateModelCompilationOptionsFromSessionOptions)Marshal.GetDelegateForFunctionPointer( + _compileApi.CreateModelCompilationOptionsFromSessionOptions, + typeof(DOrtCreateModelCompilationOptionsFromSessionOptions)); + + OrtModelCompilationOptions_SetInputModelPath = + (DOrtModelCompilationOptions_SetInputModelPath)Marshal.GetDelegateForFunctionPointer( + _compileApi.ModelCompilationOptions_SetInputModelPath, + typeof(DOrtModelCompilationOptions_SetInputModelPath)); + + OrtModelCompilationOptions_SetInputModelFromBuffer = + (DOrtModelCompilationOptions_SetInputModelFromBuffer)Marshal.GetDelegateForFunctionPointer( + _compileApi.ModelCompilationOptions_SetInputModelFromBuffer, + typeof(DOrtModelCompilationOptions_SetInputModelFromBuffer)); + + OrtModelCompilationOptions_SetOutputModelPath = + (DOrtModelCompilationOptions_SetOutputModelPath)Marshal.GetDelegateForFunctionPointer( + _compileApi.ModelCompilationOptions_SetOutputModelPath, + typeof(DOrtModelCompilationOptions_SetOutputModelPath)); + + OrtModelCompilationOptions_SetOutputModelExternalInitializersFile = + (DOrtModelCompilationOptions_SetOutputModelExternalInitializersFile)Marshal.GetDelegateForFunctionPointer( + _compileApi.ModelCompilationOptions_SetOutputModelExternalInitializersFile, + typeof(DOrtModelCompilationOptions_SetOutputModelExternalInitializersFile)); + + OrtModelCompilationOptions_SetOutputModelBuffer = + (DOrtModelCompilationOptions_SetOutputModelBuffer)Marshal.GetDelegateForFunctionPointer( + _compileApi.ModelCompilationOptions_SetOutputModelBuffer, + typeof(DOrtModelCompilationOptions_SetOutputModelBuffer)); + + OrtModelCompilationOptions_SetEpContextEmbedMode = + (DOrtModelCompilationOptions_SetEpContextEmbedMode)Marshal.GetDelegateForFunctionPointer( + _compileApi.ModelCompilationOptions_SetEpContextEmbedMode, + typeof(DOrtModelCompilationOptions_SetEpContextEmbedMode)); + + OrtCompileModel = + (DOrtCompileModel)Marshal.GetDelegateForFunctionPointer( + _compileApi.CompileModel, + 
typeof(DOrtCompileModel)); + } +} diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index 77c35aac65b92..620c13b8641b5 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -336,12 +336,43 @@ public struct OrtApi public IntPtr GetModelEditorApi; public IntPtr CreateTensorWithDataAndDeleterAsOrtValue; public IntPtr SessionOptionsSetLoadCancellationFlag; + + public IntPtr GetCompileApi; + + public IntPtr CreateKeyValuePairs; + public IntPtr AddKeyValuePair; + public IntPtr GetKeyValue; + public IntPtr GetKeyValuePairs; + public IntPtr RemoveKeyValuePair; + public IntPtr ReleaseKeyValuePairs; + + public IntPtr RegisterExecutionProviderLibrary; + public IntPtr UnregisterExecutionProviderLibrary; + + public IntPtr GetEpDevices; + + public IntPtr SessionOptionsAppendExecutionProvider_V2; + public IntPtr SessionOptionsSetEpSelectionPolicy; + + public IntPtr HardwareDevice_Type; + public IntPtr HardwareDevice_VendorId; + public IntPtr HardwareDevice_Vendor; + public IntPtr HardwareDevice_DeviceId; + public IntPtr HardwareDevice_Metadata; + + public IntPtr EpDevice_EpName; + public IntPtr EpDevice_EpVendor; + public IntPtr EpDevice_EpMetadata; + public IntPtr EpDevice_EpOptions; + public IntPtr EpDevice_Device; } internal static class NativeMethods { static OrtApi api_; + static internal CompileApi.NativeMethods CompileApi; + #if NETSTANDARD2_0 [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr DOrtGetApi(UInt32 version); @@ -582,6 +613,85 @@ static NativeMethods() typeof(DReleaseLoraAdapter)); OrtRunOptionsAddActiveLoraAdapter = (DOrtRunOptionsAddActiveLoraAdapter)Marshal.GetDelegateForFunctionPointer( api_.RunOptionsAddActiveLoraAdapter, typeof(DOrtRunOptionsAddActiveLoraAdapter)); + + OrtGetCompileApi = (DOrtGetCompileApi)Marshal.GetDelegateForFunctionPointer( + api_.GetCompileApi, typeof(DOrtGetCompileApi)); + + // populate the CompileApi struct now that we have the delegate to get the compile API pointer. 
+ CompileApi = new CompileApi.NativeMethods(OrtGetCompileApi); + + OrtCreateKeyValuePairs = (DOrtCreateKeyValuePairs)Marshal.GetDelegateForFunctionPointer( + api_.CreateKeyValuePairs, typeof(DOrtCreateKeyValuePairs)); + + OrtAddKeyValuePair = (DOrtAddKeyValuePair)Marshal.GetDelegateForFunctionPointer( + api_.AddKeyValuePair, typeof(DOrtAddKeyValuePair)); + + OrtGetKeyValue = (DOrtGetKeyValue)Marshal.GetDelegateForFunctionPointer( + api_.GetKeyValue, typeof(DOrtGetKeyValue)); + + OrtGetKeyValuePairs = (DOrtGetKeyValuePairs)Marshal.GetDelegateForFunctionPointer( + api_.GetKeyValuePairs, typeof(DOrtGetKeyValuePairs)); + + OrtRemoveKeyValuePair = (DOrtRemoveKeyValuePair)Marshal.GetDelegateForFunctionPointer( + api_.RemoveKeyValuePair, typeof(DOrtRemoveKeyValuePair)); + + OrtReleaseKeyValuePairs = (DOrtReleaseKeyValuePairs)Marshal.GetDelegateForFunctionPointer( + api_.ReleaseKeyValuePairs, typeof(DOrtReleaseKeyValuePairs)); + + OrtHardwareDevice_Type = (DOrtHardwareDevice_Type)Marshal.GetDelegateForFunctionPointer( + api_.HardwareDevice_Type, typeof(DOrtHardwareDevice_Type)); + + OrtHardwareDevice_VendorId = (DOrtHardwareDevice_VendorId)Marshal.GetDelegateForFunctionPointer( + api_.HardwareDevice_VendorId, typeof(DOrtHardwareDevice_VendorId)); + + OrtHardwareDevice_Vendor = (DOrtHardwareDevice_Vendor)Marshal.GetDelegateForFunctionPointer( + api_.HardwareDevice_Vendor, typeof(DOrtHardwareDevice_Vendor)); + + OrtHardwareDevice_DeviceId = (DOrtHardwareDevice_DeviceId)Marshal.GetDelegateForFunctionPointer( + api_.HardwareDevice_DeviceId, typeof(DOrtHardwareDevice_DeviceId)); + + OrtHardwareDevice_Metadata = (DOrtHardwareDevice_Metadata)Marshal.GetDelegateForFunctionPointer( + api_.HardwareDevice_Metadata, typeof(DOrtHardwareDevice_Metadata)); + + + OrtEpDevice_EpName = (DOrtEpDevice_EpName)Marshal.GetDelegateForFunctionPointer( + api_.EpDevice_EpName, typeof(DOrtEpDevice_EpName)); + + OrtEpDevice_EpVendor = (DOrtEpDevice_EpVendor)Marshal.GetDelegateForFunctionPointer( + api_.EpDevice_EpVendor, typeof(DOrtEpDevice_EpVendor)); + + OrtEpDevice_EpMetadata = (DOrtEpDevice_EpMetadata)Marshal.GetDelegateForFunctionPointer( + api_.EpDevice_EpMetadata, typeof(DOrtEpDevice_EpMetadata)); + + OrtEpDevice_EpOptions = (DOrtEpDevice_EpOptions)Marshal.GetDelegateForFunctionPointer( + api_.EpDevice_EpOptions, typeof(DOrtEpDevice_EpOptions)); + + OrtEpDevice_Device = (DOrtEpDevice_Device)Marshal.GetDelegateForFunctionPointer( + api_.EpDevice_Device, typeof(DOrtEpDevice_Device)); + + OrtRegisterExecutionProviderLibrary = + (DOrtRegisterExecutionProviderLibrary)Marshal.GetDelegateForFunctionPointer( + api_.RegisterExecutionProviderLibrary, + typeof(DOrtRegisterExecutionProviderLibrary)); + + OrtUnregisterExecutionProviderLibrary = + (DOrtUnregisterExecutionProviderLibrary)Marshal.GetDelegateForFunctionPointer( + api_.UnregisterExecutionProviderLibrary, + typeof(DOrtUnregisterExecutionProviderLibrary)); + + OrtGetEpDevices = (DOrtGetEpDevices)Marshal.GetDelegateForFunctionPointer( + api_.GetEpDevices, + typeof(DOrtGetEpDevices)); + + OrtSessionOptionsAppendExecutionProvider_V2 = + (DOrtSessionOptionsAppendExecutionProvider_V2)Marshal.GetDelegateForFunctionPointer( + api_.SessionOptionsAppendExecutionProvider_V2, + typeof(DOrtSessionOptionsAppendExecutionProvider_V2)); + + OrtSessionOptionsSetEpSelectionPolicy = + (DSessionOptionsSetEpSelectionPolicy)Marshal.GetDelegateForFunctionPointer( + api_.SessionOptionsSetEpSelectionPolicy, + typeof(DSessionOptionsSetEpSelectionPolicy)); } internal class NativeLib @@ -823,7 
+933,7 @@ internal class NativeLib IntPtr /* (OrtEnv*) */ environment, //[MarshalAs(UnmanagedType.LPStr)]string modelPath byte[] modelPath, - IntPtr /* (OrtSessionOptions*) */ sessopnOptions, + IntPtr /* (OrtSessionOptions*) */ sessionOptions, out IntPtr /**/ session); public static DOrtCreateSession OrtCreateSession; @@ -1350,7 +1460,7 @@ out IntPtr lora_adapter #endregion - #region RunOptions API +#region RunOptions API [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /*(OrtStatus*)*/ DOrtCreateRunOptions(out IntPtr /* OrtRunOptions** */ runOptions); @@ -2153,7 +2263,168 @@ out IntPtr lora_adapter #endregion -#region Misc API +#region Compile API + +#if NETSTANDARD2_0 + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr DOrtGetCompileApi(); +#else + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate ref CompileApi.OrtCompileApi DOrtGetCompileApi(); +#endif + public static DOrtGetCompileApi OrtGetCompileApi; +#endregion + +#region Auto EP API related + // + // OrtKeyValuePairs + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate void DOrtCreateKeyValuePairs(out IntPtr /* OrtKeyValuePairs** */ kvps); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate void DOrtAddKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, + byte[] /* const char* */ key, + byte[] /* const char* */ value); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* const char* */ DOrtGetKeyValue(IntPtr /* const OrtKeyValuePairs* */ kvps, + byte[] /* const char* */ key); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate void DOrtGetKeyValuePairs(IntPtr /* const OrtKeyValuePairs* */ kvps, + out IntPtr /* const char* const** */ keys, + out IntPtr /* const char* const** */ values, + out UIntPtr /* size_t* */ numEntries); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate void DOrtRemoveKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, + byte[] /* const char* */ key); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate void DOrtReleaseKeyValuePairs(IntPtr /* OrtKeyValuePairs* */ kvps); + + + public static DOrtCreateKeyValuePairs OrtCreateKeyValuePairs; + public static DOrtAddKeyValuePair OrtAddKeyValuePair; + public static DOrtGetKeyValue OrtGetKeyValue; + public static DOrtGetKeyValuePairs OrtGetKeyValuePairs; + public static DOrtRemoveKeyValuePair OrtRemoveKeyValuePair; + public static DOrtReleaseKeyValuePairs OrtReleaseKeyValuePairs; + + + // + // OrtHardwareDevice + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate int /* OrtHardwareDeviceType */ DOrtHardwareDevice_Type( + IntPtr /* const OrtHardwareDevice* */ device); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate uint /* uint32_t */ DOrtHardwareDevice_VendorId( + IntPtr /* const OrtHardwareDevice* */ device); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* const char* */ DOrtHardwareDevice_Vendor( + IntPtr /* const OrtHardwareDevice* */ device); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate uint /* uint32_t */ DOrtHardwareDevice_DeviceId( + IntPtr /* const OrtHardwareDevice* */ device); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* const OrtKeyValuePairs* */ DOrtHardwareDevice_Metadata( + IntPtr /* const OrtHardwareDevice* */ device); + + + public static DOrtHardwareDevice_Type OrtHardwareDevice_Type; + 
public static DOrtHardwareDevice_VendorId OrtHardwareDevice_VendorId; + public static DOrtHardwareDevice_Vendor OrtHardwareDevice_Vendor; + public static DOrtHardwareDevice_DeviceId OrtHardwareDevice_DeviceId; + public static DOrtHardwareDevice_Metadata OrtHardwareDevice_Metadata; + + // + // OrtEpDevice + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* const char* */ DOrtEpDevice_EpName(IntPtr /* const OrtEpDevice* */ ep_device); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* const char* */ DOrtEpDevice_EpVendor(IntPtr /* const OrtEpDevice* */ ep_device); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* const OrtKeyValuePairs* */ DOrtEpDevice_EpMetadata( + IntPtr /* const OrtEpDevice* */ ep_device); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* const OrtKeyValuePairs* */ DOrtEpDevice_EpOptions( + IntPtr /* const OrtEpDevice* */ ep_device); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* const OrtHardwareDevice* */ DOrtEpDevice_Device( + IntPtr /* const OrtEpDevice* */ ep_device); + + + public static DOrtEpDevice_EpName OrtEpDevice_EpName; + public static DOrtEpDevice_EpVendor OrtEpDevice_EpVendor; + public static DOrtEpDevice_EpMetadata OrtEpDevice_EpMetadata; + public static DOrtEpDevice_EpOptions OrtEpDevice_EpOptions; + public static DOrtEpDevice_Device OrtEpDevice_Device; + + // + // Auto Selection EP registration and selection customization + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtRegisterExecutionProviderLibrary( + IntPtr /* OrtEnv* */ env, + byte[] /* const char* */ registration_name, + byte[] /* const ORTCHAR_T* */ path); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtUnregisterExecutionProviderLibrary( + IntPtr /* OrtEnv* */ env, + byte[] /* const char* */ registration_name); + + public static DOrtRegisterExecutionProviderLibrary OrtRegisterExecutionProviderLibrary; + public static DOrtUnregisterExecutionProviderLibrary OrtUnregisterExecutionProviderLibrary; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtGetEpDevices( + IntPtr /* const OrtEnv* */ env, + out IntPtr /* const OrtEpDevice* const** */ ep_devices, + out UIntPtr /* size_t* */ num_ep_devices); + + public static DOrtGetEpDevices OrtGetEpDevices; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtSessionOptionsAppendExecutionProvider_V2( + IntPtr /* OrtSessionOptions* */ sess_options, + IntPtr /* OrtEnv* */ env, + IntPtr[] /* const OrtEpDevice* const* */ ep_devices, + UIntPtr /* size_t */ num_ep_devices, + IntPtr /* const char* const* */ ep_option_keys, // use OrtKeyValuePairs.GetKeyValuePairHandles + IntPtr /* const char* const* */ ep_option_vals, + UIntPtr /* size_t */ num_ep_options); + + public static DOrtSessionOptionsAppendExecutionProvider_V2 OrtSessionOptionsAppendExecutionProvider_V2; + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr DOrtEpSelectionDelegate( + IntPtr /* OrtEpDevice** */ epDevices, + uint numDevices, + IntPtr /* OrtKeyValuePairs* */ modelMetadata, + IntPtr /* OrtKeyValuePairs* */ runtimeMetadata, + IntPtr /* OrtEpDevice** */ selected, + uint maxSelected, + out UIntPtr numSelected + ); + + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* 
OrtStatus* */ DSessionOptionsSetEpSelectionPolicy( + IntPtr /* OrtSessionOptions* */ session_options, + int /* OrtExecutionProviderDevicePolicy */ policy, + IntPtr /* DOrtEpSelectionDelegate* */ selection_delegate); + public static DSessionOptionsSetEpSelectionPolicy OrtSessionOptionsSetEpSelectionPolicy; + + + #endregion + #region Misc API /// /// Queries all the execution providers supported in the native onnxruntime shared library diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs index f4b2649f8d055..5c70808b82be1 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. using System; +using System.Collections.Generic; using System.Runtime.InteropServices; namespace Microsoft.ML.OnnxRuntime @@ -376,6 +377,68 @@ public OrtLoggingLevel EnvLogLevel } } + /// + /// Register an execution provider library with the OrtEnv instance. + /// A registered execution provider library can be used by all sessions created with the OrtEnv instance. + /// Devices the execution provider can utilize are added to the values returned by GetEpDevices() and can + /// be used in SessionOptions.AppendExecutionProvider to select an execution provider for a device. + /// + /// Coming: A selection policy can be specified and ORT will automatically select the best execution providers + /// and devices for the model. + /// + /// The name to register the library under. + /// The path to the library to register. + /// + /// + public void RegisterExecutionProviderLibrary(string registrationName, string libraryPath) + { + var registrationNameUtf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(registrationName); + var pathUtf8 = NativeOnnxValueHelper.GetPlatformSerializedString(libraryPath); + + NativeApiStatus.VerifySuccess( + NativeMethods.OrtRegisterExecutionProviderLibrary(handle, registrationNameUtf8, pathUtf8)); + } + + /// + /// Unregister an execution provider library from the OrtEnv instance. + /// + /// The name the library was registered under. + public void UnregisterExecutionProviderLibrary(string registrationName) + { + var registrationNameUtf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(registrationName); + + NativeApiStatus.VerifySuccess( + NativeMethods.OrtUnregisterExecutionProviderLibrary(handle, registrationNameUtf8)); + } + + /// + /// Get the list of all execution provider and device combinations that are available. + /// These can be used to select the execution provider and device for a session. + /// + /// + /// + /// + public IReadOnlyList GetEpDevices() + { + IntPtr epDevicesPtr; + UIntPtr numEpDevices; + + NativeApiStatus.VerifySuccess(NativeMethods.OrtGetEpDevices(handle, out epDevicesPtr, out numEpDevices)); + + int count = (int)numEpDevices; + var epDevices = new List(count); + + IntPtr[] epDevicePtrs = new IntPtr[count]; + Marshal.Copy(epDevicesPtr, epDevicePtrs, 0, count); + + foreach (var ptr in epDevicePtrs) + { + epDevices.Add(new OrtEpDevice(ptr)); + } + + return epDevices.AsReadOnly(); + } + #endregion #region SafeHandle overrides diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEpDevice.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEpDevice.shared.cs new file mode 100644 index 0000000000000..e3947d900214e --- /dev/null +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEpDevice.shared.cs @@ -0,0 +1,98 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. + +using System; +using System.Runtime.InteropServices; + +namespace Microsoft.ML.OnnxRuntime +{ + /// + /// Represents the combination of an execution provider and a hardware device + /// that the execution provider can utilize. + /// + public class OrtEpDevice : SafeHandle + { + /// + /// Construct an OrtEpDevice from an existing native OrtEpDevice instance. + /// + /// Native OrtEpDevice handle. + internal OrtEpDevice(IntPtr epDeviceHandle) + : base(epDeviceHandle, ownsHandle: false) + { + } + + internal IntPtr Handle => handle; + + /// + /// The name of the execution provider. + /// + public string EpName + { + get + { + IntPtr namePtr = NativeMethods.OrtEpDevice_EpName(handle); + return NativeOnnxValueHelper.StringFromNativeUtf8(namePtr); + } + } + + /// + /// The vendor who owns the execution provider. + /// + public string EpVendor + { + get + { + IntPtr vendorPtr = NativeMethods.OrtEpDevice_EpVendor(handle); + return NativeOnnxValueHelper.StringFromNativeUtf8(vendorPtr); + } + } + + /// + /// Execution provider metadata. + /// + public OrtKeyValuePairs EpMetadata + { + get + { + return new OrtKeyValuePairs(NativeMethods.OrtEpDevice_EpMetadata(handle)); + } + } + + /// + /// Execution provider options. + /// + public OrtKeyValuePairs EpOptions + { + get + { + return new OrtKeyValuePairs(NativeMethods.OrtEpDevice_EpOptions(handle)); + } + } + + /// + /// The hardware device that the execution provider can utilize. + /// + public OrtHardwareDevice HardwareDevice + { + get + { + IntPtr devicePtr = NativeMethods.OrtEpDevice_Device(handle); + return new OrtHardwareDevice(devicePtr); + } + } + + /// + /// Indicates whether the native handle is invalid. + /// + public override bool IsInvalid => handle == IntPtr.Zero; + + /// + /// No-op. OrtEpDevice is always read-only as the instance is owned by native ORT. + /// + /// True + protected override bool ReleaseHandle() + { + return true; + } + } +} \ No newline at end of file diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtHardwareDevice.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtHardwareDevice.shared.cs new file mode 100644 index 0000000000000..8e7caae90ff79 --- /dev/null +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtHardwareDevice.shared.cs @@ -0,0 +1,116 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + + +namespace Microsoft.ML.OnnxRuntime +{ + using System; + using System.Runtime.InteropServices; + + /// + /// Represents the type of hardware device. + /// Matches OrtHardwareDeviceType in the ORT C API. + /// + public enum OrtHardwareDeviceType + { + CPU = 0, + GPU = 1, + NPU = 2, + } + + /// + /// Represents a hardware device that is available on the current system. + /// + public class OrtHardwareDevice : SafeHandle + { + + /// + /// Construct an OrtHardwareDevice for a native OrtHardwareDevice instance. + /// + /// Native OrtHardwareDevice handle. + internal OrtHardwareDevice(IntPtr deviceHandle) + : base(deviceHandle, ownsHandle: false) + { + } + + /// + /// Get the type of hardware device. + /// + public OrtHardwareDeviceType Type + { + get + { + return (OrtHardwareDeviceType)NativeMethods.OrtHardwareDevice_Type(handle); + } + } + + /// + /// Get the vendor ID of the hardware device if known. + /// + /// + /// For PCIe devices the vendor ID is the PCIe vendor ID. See https://pcisig.com/membership/member-companies. 
+ /// + public uint VendorId + { + get + { + return NativeMethods.OrtHardwareDevice_VendorId(handle); + } + } + + /// + /// The vendor (manufacturer) of the hardware device. + /// + public string Vendor + { + get + { + IntPtr vendorPtr = NativeMethods.OrtHardwareDevice_Vendor(handle); + return NativeOnnxValueHelper.StringFromNativeUtf8(vendorPtr); + } + } + + /// + /// Get the device ID of the hardware device if known. + /// + /// + /// This is the identifier of the device model. + /// PCIe device IDs can be looked up at https://www.pcilookup.com/ when combined with the VendorId. + /// It is NOT a unique identifier for the device in the current system. + /// + public uint DeviceId + { + get + { + return NativeMethods.OrtHardwareDevice_DeviceId(handle); + } + } + + /// + /// Get device metadata. + /// This may include information such as whether a GPU is discrete or integrated. + /// The available metadata will differ by platform and device type. + /// + public OrtKeyValuePairs Metadata + { + get + { + return new OrtKeyValuePairs(NativeMethods.OrtHardwareDevice_Metadata(handle)); + } + } + + /// + /// Indicates whether the native handle is invalid. + /// + public override bool IsInvalid => handle == IntPtr.Zero; + + /// + /// No-op. OrtHardwareDevice is always read-only as the instance is owned by native ORT. + /// + /// True + protected override bool ReleaseHandle() + { + return true; + } + } +} \ No newline at end of file diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtKeyValuePairs.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtKeyValuePairs.shared.cs new file mode 100644 index 0000000000000..6a8d1037d9017 --- /dev/null +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtKeyValuePairs.shared.cs @@ -0,0 +1,192 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + + +namespace Microsoft.ML.OnnxRuntime +{ + using System; + using System.Collections.Generic; + using System.Runtime.InteropServices; + + /// + /// Class to manage key-value pairs. + /// These are most often used for options and metadata. + /// + /// + /// + /// + public class OrtKeyValuePairs : SafeHandle + { + private readonly bool _createdHandle; + + // cache the values here for convenience. + // we could force a call to the C API every time in case something was changed in the background. + private Dictionary _keyValuePairs; + + /// + /// Create a new OrtKeyValuePairs instance. + /// + /// + /// A backing native instance is created and kept in sync with the C# content. + /// + public OrtKeyValuePairs() + : base(IntPtr.Zero, ownsHandle: true) + { + NativeMethods.OrtCreateKeyValuePairs(out handle); + _createdHandle = true; + _keyValuePairs = new Dictionary(); + } + + /// + /// Create a new OrtKeyValuePairs instance from an existing native OrtKeyValuePairs handle. + /// + /// Native OrtKeyValuePairs handle. + /// + /// The instance is read-only, so calling Add or Remove will throw an InvalidOperationError. + /// + internal OrtKeyValuePairs(IntPtr constHandle) + : base(constHandle, ownsHandle: false) + { + _createdHandle = false; + _keyValuePairs = GetLatest(); + } + + /// + /// Create a new OrtKeyValuePairs instance from a dictionary. + /// + /// Key-value pairs to add. + /// + /// A backing native instance is created and kept in sync with the C# content. 
+ /// + public OrtKeyValuePairs(IReadOnlyDictionary keyValuePairs) + : base(IntPtr.Zero, ownsHandle: true) + { + NativeMethods.OrtCreateKeyValuePairs(out handle); + _createdHandle = true; + _keyValuePairs = new Dictionary(keyValuePairs != null ? keyValuePairs.Count : 0); + + if (keyValuePairs != null && keyValuePairs.Count > 0) + { + foreach (var kvp in keyValuePairs) + { + Add(kvp.Key, kvp.Value); + } + } + } + + /// + /// Current key-value pair entries. + /// + /// + /// Call Refresh() to update the cached values with the latest from the backing native instance. + /// In general that should not be required as it's not expected an OrtKeyValuePairs instance would be + /// updated by both native and C# code. + /// + public IReadOnlyDictionary Entries => _keyValuePairs; + + /// + /// Adds a key-value pair. Overrides any existing value for the key. + /// + /// Key to add. Must not be null or empty. + /// Value to add. May be empty. Must not be null. + public void Add(string key, string value) + { + if (!_createdHandle) + { + throw new InvalidOperationException($"{nameof(Add)} can only be called on instances you created."); + } + + var keyPtr = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(key); + var valuePtr = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(value); + NativeMethods.OrtAddKeyValuePair(handle, keyPtr, valuePtr); + _keyValuePairs[key] = value; // update the cached value + } + + /// + /// Update the cached values with the latest from the backing native instance as that is the source of truth. + /// + public void Refresh() + { + // refresh the cached values. + _keyValuePairs = GetLatest(); + } + + /// + /// Removes a key-value pair by key. Ignores keys that do not exist. + /// + /// Key to remove. + public void Remove(string key) + { + if (!_createdHandle) + { + throw new InvalidOperationException($"{nameof(Remove)} can only be called on instances you created."); + } + + var keyPtr = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(key); + NativeMethods.OrtRemoveKeyValuePair(handle, keyPtr); + + _keyValuePairs.Remove(key); // update the cached value + } + + // for internal usage to pass into the call to OrtSessionOptionsAppendExecutionProvider_V2 + // from SessionOptions::AppendExecutionProvider + internal void GetKeyValuePairHandles(out IntPtr keysHandle, out IntPtr valuesHandle, out UIntPtr numEntries) + { + if (IsInvalid) + { + throw new InvalidOperationException($"{nameof(GetKeyValuePairHandles)}: Invalid instance."); + } + + NativeMethods.OrtGetKeyValuePairs(handle, out keysHandle, out valuesHandle, out numEntries); + } + + /// + /// Fetch all the key/value pairs to make sure we are in sync with the C API. + /// + private Dictionary GetLatest() + { + var dict = new Dictionary(); + if (IsInvalid) + { + return dict; + } + + IntPtr keys, values; + UIntPtr numEntries; + NativeMethods.OrtGetKeyValuePairs(handle, out keys, out values, out numEntries); + + ulong count = numEntries.ToUInt64(); + int offset = 0; + for (ulong i = 0; i < count; i++, offset += IntPtr.Size) + { + IntPtr keyPtr = Marshal.ReadIntPtr(keys, offset); + IntPtr valuePtr = Marshal.ReadIntPtr(values, offset); + var key = NativeOnnxValueHelper.StringFromNativeUtf8(keyPtr); + var value = NativeOnnxValueHelper.StringFromNativeUtf8(valuePtr); + dict.Add(key, value); + } + + return dict; + } + + /// + /// Indicates whether the native handle is invalid. + /// + public override bool IsInvalid { get { return handle == IntPtr.Zero; } } + + /// + /// Release the native instance of OrtKeyValuePairs if we own it. 
+ /// + /// true + protected override bool ReleaseHandle() + { + if (_createdHandle) + { + NativeMethods.OrtReleaseKeyValuePairs(handle); + handle = IntPtr.Zero; + } + + return true; + } + } +} \ No newline at end of file diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs index 7a5c3aaa19eac..f3c0287d2bf9d 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs @@ -689,8 +689,8 @@ public static OrtValue CreateTensorValueFromMemory(T[] data, long[] shape) wh /// The method will attempt to pin managed memory so no copying occurs when data is passed down /// to native code. /// - /// Tensor object - /// discovered tensor element type + /// + /// Tensor object /// And instance of OrtValue constructed on top of the object [Experimental("SYSLIB5001")] public static OrtValue CreateTensorValueFromSystemNumericsTensorObject(SystemNumericsTensors.Tensor tensor) where T : unmanaged diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs index 9b0f183f03681..de6189e105f78 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs @@ -32,6 +32,21 @@ public enum ExecutionMode ORT_PARALLEL = 1, } + /// + /// Controls the execution provider selection when using automatic EP selection. + /// Execution providers must be registered with the OrtEnv to be available for selection. + /// + public enum ExecutionProviderDevicePolicy + { + DEFAULT = 0, + PREFER_CPU = 1, + PREFER_NPU, + PREFER_GPU, + MAX_PERFORMANCE, + MAX_EFFICIENCY, + MIN_OVERALL_POWER, + } + /// /// Holds the options for creating an InferenceSession /// It forces the instantiation of the OrtEnv singleton. @@ -408,6 +423,82 @@ public void AppendExecutionProvider(string providerName, Dictionary + /// Select execution providers from the list of available execution providers and devices returned by + /// GetEpDevices. + /// + /// One or more OrtEpDevice instances may be provided in epDevices, but must all be for the same + /// execution provider. + /// + /// Make multiple calls to AppendExecutionProvider if you wish to use multiple execution providers. + /// + /// e.g. + /// - if execution provider 'A' has an OrtEpDevice for NPU and one for GPU and you wish to use it for + /// both devices, pass the two OrtEpDevice instances in the epDevices list in one call. + /// - if you wish to use execution provider 'B' for GPU and execution provider 'C' for CPU, + /// make two calls to AppendExecutionProvider, with one OrtEpDevice in the epDevices list in each call. + /// + /// The priority of the execution providers is set by the order in which they are appended. + /// Highest priority is first. + /// + /// OrtEnv that provided the OrtEpDevice instances via a call to GetEpDevices. + /// One or more OrtEpDevice instances to append. + /// These must all have the save EpName value. + /// Optional options to configure the execution provider. May be null. + /// epDevices was empty. 
+ /// + public void AppendExecutionProvider(OrtEnv env, IReadOnlyList epDevices, + IReadOnlyDictionary epOptions) + { + if (epDevices == null || epDevices.Count == 0) + { + throw new ArgumentException("No execution provider devices were specified."); + } + + // Convert EpDevices to native pointers + IntPtr[] epDevicePtrs = new IntPtr[epDevices.Count]; + for (int i = 0; i < epDevices.Count; i++) + { + epDevicePtrs[i] = epDevices[i].Handle; + } + + if (epOptions != null && epOptions.Count > 0) + { + // this creates an OrtKeyValuePairs instance with a backing native instance + using var kvps = new OrtKeyValuePairs(epOptions); + + // get the native key/value handles so we can pass those straight through to the C API + // and not have to do any special marshaling here. + IntPtr epOptionsKeys, epOptionsValues; + UIntPtr epOptionsCount; + kvps.GetKeyValuePairHandles(out epOptionsKeys, out epOptionsValues, out epOptionsCount); + + NativeApiStatus.VerifySuccess( + NativeMethods.OrtSessionOptionsAppendExecutionProvider_V2( + handle, + env.Handle, + epDevicePtrs, + (UIntPtr)epDevices.Count, + epOptionsKeys, + epOptionsValues, + epOptionsCount)); + } + else + { + NativeApiStatus.VerifySuccess( + NativeMethods.OrtSessionOptionsAppendExecutionProvider_V2( + handle, + env.Handle, + epDevicePtrs, + (UIntPtr)epDevices.Count, + IntPtr.Zero, // EP options keys + IntPtr.Zero, // EP options values + UIntPtr.Zero)); // EP options count + } + + } + #endregion //ExecutionProviderAppends #region Public Methods @@ -452,8 +543,8 @@ public void RegisterCustomOpLibraryV2(string libraryPath, out IntPtr libraryHand // End result of that is // SessionOptions.RegisterCustomOpLibrary calls NativeMethods.OrtRegisterCustomOpsLibrary_V2 // SessionOptions.RegisterCustomOpLibraryV2 calls NativeMethods.OrtRegisterCustomOpsLibrary - var utf8Path = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(libraryPath); - NativeApiStatus.VerifySuccess(NativeMethods.OrtRegisterCustomOpsLibrary(handle, utf8Path, + var platformPath = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(libraryPath); + NativeApiStatus.VerifySuccess(NativeMethods.OrtRegisterCustomOpsLibrary(handle, platformPath, out libraryHandle)); } @@ -536,6 +627,18 @@ public void AddFreeDimensionOverrideByName(string dimName, long dimValue) var utf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(dimName); NativeApiStatus.VerifySuccess(NativeMethods.OrtAddFreeDimensionOverrideByName(handle, utf8, dimValue)); } + + /// + /// Set the execution provider selection policy if using automatic execution provider selection. + /// Execution providers must be registered with the OrtEnv to be available for selection. + /// + /// Policy to use. + public void SetEpSelectionPolicy(ExecutionProviderDevicePolicy policy) + { + NativeApiStatus.VerifySuccess( + NativeMethods.OrtSessionOptionsSetEpSelectionPolicy(handle, (int)policy, IntPtr.Zero)); + } + #endregion internal IntPtr Handle diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/CompileApiTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/CompileApiTests.cs new file mode 100644 index 0000000000000..72c165df56418 --- /dev/null +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/CompileApiTests.cs @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +// not supported on mobile platforms +#if !(ANDROID || IOS) + +namespace Microsoft.ML.OnnxRuntime.Tests; + +using System; +using System.Globalization; +using System.Runtime.InteropServices; +using Xunit; + + +public class CompileApiTests +{ + private OrtEnv ortEnvInstance = OrtEnv.Instance(); + + + [Fact] + public void BasicUsage() + { + var so = new SessionOptions(); + using (var compileOptions = new OrtModelCompilationOptions(so)) + { + // mainly checking these don't throw which ensures all the plumbing for the binding works. + compileOptions.SetInputModelPath("model.onnx"); + compileOptions.SetOutputModelPath("compiled_model.onnx"); + + compileOptions.SetOutputModelExternalInitializersFile("external_data.bin", 512); + compileOptions.SetEpContextEmbedMode(true); + + } + + // setup a new instance as SetOutputModelExternalInitializersFile is incompatible with SetOutputModelBuffer + using (var compileOptions = new OrtModelCompilationOptions(so)) + { + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + compileOptions.SetInputModelFromBuffer(model); + + // SetOutputModelBuffer updates the user provided IntPtr and size when it allocates data post-compile. + // Due to that we need to allocate an IntPtr and UIntPtr here. + IntPtr bytePtr = new IntPtr(); + UIntPtr bytesSize = new UIntPtr(); + var allocator = OrtAllocator.DefaultInstance; + compileOptions.SetOutputModelBuffer(allocator, ref bytePtr, ref bytesSize); + + compileOptions.CompileModel(); + + Assert.NotEqual(IntPtr.Zero, bytePtr); + Assert.NotEqual(UIntPtr.Zero, bytesSize); + + byte[] compiledBytes = new byte[bytesSize.ToUInt64()]; + Marshal.Copy(bytePtr, compiledBytes, 0, (int)bytesSize.ToUInt32()); + + // Check the compiled model is valid + using (var session = new InferenceSession(compiledBytes, so)) + { + Assert.NotNull(session); + } + + allocator.FreeMemory(bytePtr); + } + } +} + +#endif \ No newline at end of file diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs new file mode 100644 index 0000000000000..1aa4db15d275c --- /dev/null +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs @@ -0,0 +1,159 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// not supported on mobile platforms +#if !(ANDROID || IOS) + +namespace Microsoft.ML.OnnxRuntime.Tests; + +using System; +using System.Linq; +using System.IO; +using System.Runtime.InteropServices; +using Xunit; +using System.Collections.Generic; + +/// +/// Tests for auto ep selection/registration. +/// Includes testing of OrtHardwareDevice and OrtEpDevice as those only come from auto ep related code and we only +/// get read-only access to them (i.e. we can't directly create instances of them to test). 
+/// +public class OrtAutoEpTests +{ + private OrtEnv ortEnvInstance = OrtEnv.Instance(); + + private void ReadHardwareDeviceValues(OrtHardwareDevice device) + { + Assert.True(device.Type == OrtHardwareDeviceType.CPU || + device.Type == OrtHardwareDeviceType.GPU || + device.Type == OrtHardwareDeviceType.NPU); + if (device.Type == OrtHardwareDeviceType.CPU) + { + Assert.NotEmpty(device.Vendor); + } + else + { + Assert.True(device.VendorId != 0); + Assert.True(device.DeviceId != 0); + } + + var metadata = device.Metadata; + Assert.NotNull(metadata); + foreach (var kvp in metadata.Entries) + { + Assert.NotEmpty(kvp.Key); + // Assert.NotEmpty(kvp.Value); this is allowed + } + } + + [Fact] + public void GetEpDevices() + { + var epDevices = ortEnvInstance.GetEpDevices(); + Assert.NotNull(epDevices); + Assert.NotEmpty(epDevices); + foreach (var ep_device in epDevices) + { + Assert.NotEmpty(ep_device.EpName); + Assert.NotEmpty(ep_device.EpVendor); + var metadata = ep_device.EpMetadata; + Assert.NotNull(metadata); + var options = ep_device.EpOptions; + Assert.NotNull(options); + ReadHardwareDeviceValues(ep_device.HardwareDevice); + } + } + + [Fact] + public void RegisterUnregisterLibrary() + { + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + string libFullPath = Path.Combine(Directory.GetCurrentDirectory(), "example_plugin_ep.dll"); + Assert.True(File.Exists(libFullPath), $"Expected lib {libFullPath} does not exist."); + + // example plugin ep uses the registration name as the ep name + const string epName = "csharp_ep"; + + // register. shouldn't throw + ortEnvInstance.RegisterExecutionProviderLibrary(epName, libFullPath); + + // check OrtEpDevice was found + var epDevices = ortEnvInstance.GetEpDevices(); + var found = epDevices.Any(d => string.Equals(epName, d.EpName, StringComparison.OrdinalIgnoreCase)); + Assert.True(found); + + // unregister + ortEnvInstance.UnregisterExecutionProviderLibrary(epName); + } + } + + [Fact] + public void AppendToSessionOptionsV2() + { + var runTest = (Func> getEpOptions) => + { + SessionOptions sessionOptions = new SessionOptions(); + sessionOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE; + + var epDevices = ortEnvInstance.GetEpDevices(); + + // cpu ep ignores the provider options so we can use any value in epOptions and it won't break. + List selectedEpDevices = epDevices.Where(d => d.EpName == "CPUExecutionProvider").ToList(); + + Dictionary epOptions = getEpOptions(); + sessionOptions.AppendExecutionProvider(ortEnvInstance, selectedEpDevices, epOptions); + + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + + // session should load successfully + using (var session = new InferenceSession(model)) + { + Assert.NotNull(session); + } + }; + + runTest(() => + { + // null options + return null; + }); + + runTest(() => + { + // empty options + return new Dictionary(); + }); + + runTest(() => + { + // dummy options + return new Dictionary + { + { "random_key", "value" }, + }; + }); + } + + [Fact] + public void SetEpSelectionPolicy() + { + SessionOptions sessionOptions = new SessionOptions(); + sessionOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE; + + var epDevices = ortEnvInstance.GetEpDevices(); + Assert.NotEmpty(epDevices); + + // doesn't matter what the value is. 
should fallback to ORT CPU EP + sessionOptions.SetEpSelectionPolicy(ExecutionProviderDevicePolicy.PREFER_GPU); + + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + + // session should load successfully + using (var session = new InferenceSession(model)) + { + Assert.NotNull(session); + } + } +} +#endif \ No newline at end of file diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtKeyValuePairsTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtKeyValuePairsTests.cs new file mode 100644 index 0000000000000..b89b970688d5f --- /dev/null +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtKeyValuePairsTests.cs @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +using System; +using Xunit; + +namespace Microsoft.ML.OnnxRuntime.Tests; + +public class OrtKeyValuePairsTests +{ + private OrtEnv ortEnvInstance = OrtEnv.Instance(); + + + [Fact] + public void CRUD() + { + using var kvp = new OrtKeyValuePairs(); + kvp.Add("key1", "value1"); + kvp.Add("key2", "value2"); + kvp.Add("key3", ""); // allowed + + Assert.Equal("value1", kvp.Entries["key1"]); + Assert.Equal("value2", kvp.Entries["key2"]); + Assert.Equal("", kvp.Entries["key3"]); + + kvp.Remove("key1"); + Assert.False(kvp.Entries.ContainsKey("key1")); + + kvp.Remove("invalid_key"); // shouldn't break + + Assert.Equal(2, kvp.Entries.Count); + + // refresh from the C API to make sure everything is in sync + kvp.Refresh(); + Assert.Equal(2, kvp.Entries.Count); + Assert.Equal("value2", kvp.Entries["key2"]); + Assert.Equal("", kvp.Entries["key3"]); + } +} diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj index a8abcd2b4aa1c..ee3c8c69aa2ae 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj @@ -70,7 +70,8 @@ + $(NativeBuildOutputDir)\custom_op_library*.dll; + $(NativeBuildOutputDir)\example_plugin_ep.dll"> PreserveNewest false diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index 7928f9b822cf0..7be518a39480f 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -562,8 +562,9 @@ ORT_API(void, GetKeyValuePairs, _In_ const OrtKeyValuePairs* kvps, ORT_API(void, RemoveKeyValuePair, _In_ OrtKeyValuePairs* kvps, _In_ const char* key); ORT_API(void, ReleaseKeyValuePairs, _Frees_ptr_opt_ OrtKeyValuePairs*); -ORT_API_STATUS_IMPL(RegisterExecutionProviderLibrary, _In_ OrtEnv* env, const char* ep_name, const ORTCHAR_T* path); -ORT_API_STATUS_IMPL(UnregisterExecutionProviderLibrary, _In_ OrtEnv* env, _In_ const char* ep_name); +ORT_API_STATUS_IMPL(RegisterExecutionProviderLibrary, _In_ OrtEnv* env, const char* registration_name, + const ORTCHAR_T* path); +ORT_API_STATUS_IMPL(UnregisterExecutionProviderLibrary, _In_ OrtEnv* env, _In_ const char* registration_name); ORT_API_STATUS_IMPL(GetEpDevices, _In_ const OrtEnv* env, _Outptr_ const OrtEpDevice* const** ep_devices, _Out_ size_t* num_ep_devices); From 6fa8ba107f461d72073ee4842830a9d0a96d2546 Mon Sep 17 00:00:00 2001 From: David Fan <30608893+jiafatom@users.noreply.github.com> Date: Sat, 3 May 2025 14:33:57 -0700 Subject: [PATCH 04/84] Remove neural_compressor dependency 
in MatMulNBits (#24627) ### Description As titled. ### Motivation and Context Dependency no need. --- .../quantization/matmul_nbits_quantizer.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py index ef08b56cfe7ad..0297472d0738c 100644 --- a/onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py @@ -8,7 +8,6 @@ import argparse import copy -import importlib import logging import os @@ -16,7 +15,6 @@ import numpy.typing as npt import onnx from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto -from packaging import version from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_matmul_8bits, quantize_qdq_matmul_4bits @@ -1356,21 +1354,7 @@ def process(self): self.model = ONNXModel(self.model) # Ensure the model is wrapped back into ONNXModel self.model.clean_initializers() else: - # use IntelĀ® Neural Compressor for RTN or GPTQ weight-only quantize algorithm - try: - importlib.import_module("neural_compressor") - except Exception as e: - logging.error(f"{e}.") - raise RuntimeError( - "neural-compressor is not correctly installed. Please check your environment." - ) from e - - import neural_compressor - - assert version.parse(neural_compressor.__version__) >= version.parse("2.3.2"), ( - "Require neural-compressor >= 2.3.2 to support weight only quantization!" - ) - + # RTN or GPTQ weight-only quantize algorithm self.int4_quant_algo() From ade008ed4c02bf3c6b2df96bf6d30d8ec6cc22ce Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Sun, 4 May 2025 00:57:40 -0700 Subject: [PATCH 05/84] [QNN EP] Enable automatic selection of QNN EP for PREFER_NPU policy (#24629) ### Description - Enables automatic selection of QNN EP for PREFER_NPU policy - Fixes cpuid vendor id for Qualcomm to be `'Q' | ('C' << 8) | ('O' << 16) | ('M' << 24);` Sample code from unit test: ```c++ // Tests autoEP feature to automatically select an EP that supports the NPU. // Currently only works on Windows. TEST_F(QnnHTPBackendTests, AutoEp_PreferNpu) { ASSERT_ORTSTATUS_OK(Ort::GetApi().RegisterExecutionProviderLibrary(*ort_env, kQnnExecutionProvider, ORT_TSTR("onnxruntime_providers_qnn.dll"))); Ort::SessionOptions so; so.SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy_PREFER_NPU); const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx"; Ort::Session session(*ort_env, ort_model_path, so); EXPECT_TRUE(SessionHasEp(session, kQnnExecutionProvider)); ASSERT_ORTSTATUS_OK(Ort::GetApi().UnregisterExecutionProviderLibrary(*ort_env, kQnnExecutionProvider)); } ``` ### Motivation and Context A recent feature allows ORT to automatically select an EP according to policies set by the user (e.g., prefer npu or prefer gpu). This PR allows QNN EP to be potentially selected when the user sets the `PREFER_NPU` policy. 
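For reference, a small standalone sketch (illustrative only, not part of this PR's diff) of why the corrected packing is the desired one: storing `'Q' | ('C' << 8) | ('O' << 16) | ('M' << 24)` in a 32-bit value places 'Q' in the lowest byte, so the id reads back as "QCOM" when viewed as little-endian bytes, whereas the previous `'Q' << 24 | ...` form spelled the characters in reverse byte order.

```c++
// Illustration only: check that the packed Qualcomm vendor id reads back as "QCOM"
// when the 32-bit value is inspected byte-by-byte on a little-endian machine.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint32_t vendor_id = 'Q' | ('C' << 8) | ('O' << 16) | ('M' << 24);
  char bytes[4];
  std::memcpy(bytes, &vendor_id, sizeof(bytes));
  assert(std::memcmp(bytes, "QCOM", sizeof(bytes)) == 0);  // holds on little-endian targets
  return 0;
}
```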
--- onnxruntime/core/common/cpuid_info.cc | 2 +- .../providers/qnn/qnn_provider_factory.cc | 138 ++++++++++++++++++ onnxruntime/core/providers/qnn/symbols.def | 2 + .../test/providers/qnn/qnn_basic_test.cc | 59 +++++--- 4 files changed, 181 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index ee7782e3c8763..91961bf22ce1e 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -155,7 +155,7 @@ std::string CPUIDInfo::GetX86Vendor(int32_t* data) { uint32_t CPUIDInfo::GetVendorId(const std::string& vendor) { if (vendor == "GenuineIntel") return 0x8086; if (vendor == "GenuineAMD") return 0x1022; - if (vendor.find("Qualcomm") == 0) return 'Q' << 24 | 'C' << 16 | 'O' << 8 | 'M'; + if (vendor.find("Qualcomm") == 0) return 'Q' | ('C' << 8) | ('O' << 16) | ('M' << 24); if (vendor.find("NV") == 0) return 0x10DE; return 0; } diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc index 7b92a23e428eb..b2f289448b013 100644 --- a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc +++ b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc @@ -79,6 +79,27 @@ struct QNN_Provider : Provider { return std::make_shared(*provider_options, config_options); } + Status CreateIExecutionProvider(const OrtHardwareDevice* const* /*devices*/, + const OrtKeyValuePairs* const* /*ep_metadata*/, + size_t num_devices, + ProviderOptions& provider_options, + const OrtSessionOptions& session_options, + const OrtLogger& logger, + std::unique_ptr& ep) override { + if (num_devices != 1) { + return Status(common::ONNXRUNTIME, ORT_EP_FAIL, "QNN EP only supports one device."); + } + + const ConfigOptions* config_options = &session_options.GetConfigOptions(); + + std::array configs_array = {&provider_options, config_options}; + const void* arg = reinterpret_cast(&configs_array); + auto ep_factory = CreateExecutionProviderFactory(arg); + ep = ep_factory->CreateProvider(session_options, logger); + + return Status::OK(); + } + void Initialize() override {} void Shutdown() override {} } g_provider; @@ -93,4 +114,121 @@ ORT_API(onnxruntime::Provider*, GetProvider) { return &onnxruntime::g_provider; } } + +#include "core/framework/error_code_helper.h" + +// OrtEpApi infrastructure to be able to use the QNN EP as an OrtEpFactory for auto EP selection. +struct QnnEpFactory : OrtEpFactory { + QnnEpFactory(const OrtApi& ort_api_in, + const char* ep_name, + OrtHardwareDeviceType hw_type, + const char* qnn_backend_type) + : ort_api{ort_api_in}, ep_name{ep_name}, ort_hw_device_type{hw_type}, qnn_backend_type{qnn_backend_type} { + GetName = GetNameImpl; + GetVendor = GetVendorImpl; + GetSupportedDevices = GetSupportedDevicesImpl; + CreateEp = CreateEpImpl; + ReleaseEp = ReleaseEpImpl; + } + + // Returns the name for the EP. Each unique factory configuration must have a unique name. + // Ex: a factory that supports NPU should have a different than a factory that supports GPU. + static const char* GetNameImpl(const OrtEpFactory* this_ptr) { + const auto* factory = static_cast(this_ptr); + return factory->ep_name.c_str(); + } + + static const char* GetVendorImpl(const OrtEpFactory* this_ptr) { + const auto* factory = static_cast(this_ptr); + return factory->vendor.c_str(); + } + + // Creates and returns OrtEpDevice instances for all OrtHardwareDevices that this factory supports. 
+ // An EP created with this factory is expected to be able to execute a model with *all* supported + // hardware devices at once. A single instance of QNN EP is not currently setup to partition a model among + // multiple different QNN backends at once (e.g, npu, cpu, gpu), so this factory instance is set to only + // support one backend: npu. To support a different backend, like gpu, create a different factory instance + // that only supports GPU. + static OrtStatus* GetSupportedDevicesImpl(OrtEpFactory* this_ptr, + const OrtHardwareDevice* const* devices, + size_t num_devices, + OrtEpDevice** ep_devices, + size_t max_ep_devices, + size_t* p_num_ep_devices) { + size_t& num_ep_devices = *p_num_ep_devices; + auto* factory = static_cast(this_ptr); + + for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) { + const OrtHardwareDevice& device = *devices[i]; + if (factory->ort_api.HardwareDevice_Type(&device) == factory->ort_hw_device_type && + factory->ort_api.HardwareDevice_VendorId(&device) == factory->vendor_id) { + OrtKeyValuePairs* ep_options = nullptr; + factory->ort_api.CreateKeyValuePairs(&ep_options); + factory->ort_api.AddKeyValuePair(ep_options, "backend_type", factory->qnn_backend_type.c_str()); + ORT_API_RETURN_IF_ERROR( + factory->ort_api.GetEpApi()->CreateEpDevice(factory, &device, nullptr, ep_options, + &ep_devices[num_ep_devices++])); + } + } + + return nullptr; + } + + static OrtStatus* CreateEpImpl(OrtEpFactory* /*this_ptr*/, + _In_reads_(num_devices) const OrtHardwareDevice* const* /*devices*/, + _In_reads_(num_devices) const OrtKeyValuePairs* const* /*ep_metadata*/, + _In_ size_t /*num_devices*/, + _In_ const OrtSessionOptions* /*session_options*/, + _In_ const OrtLogger* /*logger*/, + _Out_ OrtEp** /*ep*/) { + return onnxruntime::CreateStatus(ORT_INVALID_ARGUMENT, "QNN EP factory does not support this method."); + } + + static void ReleaseEpImpl(OrtEpFactory* /*this_ptr*/, OrtEp* /*ep*/) { + // no-op as we never create an EP here. + } + + const OrtApi& ort_api; + const std::string ep_name; // EP name + const std::string vendor{"Microsoft"}; // EP vendor name + + // Qualcomm vendor ID. Refer to the ACPI ID registry (search Qualcomm): https://uefi.org/ACPI_ID_List + const uint32_t vendor_id{'Q' | ('C' << 8) | ('O' << 16) | ('M' << 24)}; + const OrtHardwareDeviceType ort_hw_device_type; // Supported OrtHardwareDevice + const std::string qnn_backend_type; // QNN backend type for OrtHardwareDevice +}; + +extern "C" { +// +// Public symbols +// +OrtStatus* CreateEpFactories(const char* /*registration_name*/, const OrtApiBase* ort_api_base, + OrtEpFactory** factories, size_t max_factories, size_t* num_factories) { + const OrtApi* ort_api = ort_api_base->GetApi(ORT_API_VERSION); + + // Factory could use registration_name or define its own EP name. + auto factory_npu = std::make_unique(*ort_api, + onnxruntime::kQnnExecutionProvider, + OrtHardwareDeviceType_NPU, "htp"); + + // If want to support GPU, create a new factory instance because QNN EP is not currently setup to partition a single model + // among heterogeneous devices. + // std::unique_ptr factory_gpu = std::make_unique(*ort_api, "QNNExecutionProvider_GPU", OrtHardwareDeviceType_GPU, "gpu"); + + if (max_factories < 1) { + return ort_api->CreateStatus(ORT_INVALID_ARGUMENT, + "Not enough space to return EP factory. 
Need at least one."); + } + + factories[0] = factory_npu.release(); + *num_factories = 1; + + return nullptr; +} + +OrtStatus* ReleaseEpFactory(OrtEpFactory* factory) { + delete static_cast(factory); + return nullptr; +} +} #endif // !BUILD_QNN_EP_STATIC_LIB diff --git a/onnxruntime/core/providers/qnn/symbols.def b/onnxruntime/core/providers/qnn/symbols.def index 4ec2f7914c208..3afed01da1966 100644 --- a/onnxruntime/core/providers/qnn/symbols.def +++ b/onnxruntime/core/providers/qnn/symbols.def @@ -1,2 +1,4 @@ EXPORTS GetProvider + CreateEpFactories + ReleaseEpFactory diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index f736abcd3006d..0212dacadbced 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -16,6 +16,7 @@ #include "core/session/onnxruntime_run_options_config_keys.h" #include "test/providers/qnn/qnn_test_utils.h" +#include "test/util/include/api_asserts.h" #include "gtest/gtest.h" #include "gmock/gmock.h" @@ -37,24 +38,24 @@ namespace test { // TODO: When we need QNN in a minimal build we should add an ORT format version of the model #if !defined(ORT_MINIMAL_BUILD) +static bool SessionHasEp(Ort::Session& session, const char* ep_name) { + // Access the underlying InferenceSession. + const OrtSession* ort_session = session; + const InferenceSession* s = reinterpret_cast(ort_session); + bool has_ep = false; + + for (const auto& provider : s->GetRegisteredProviderTypes()) { + if (provider == ep_name) { + has_ep = true; + break; + } + } + return has_ep; +} + // Tests that the QNN EP is registered when added via the public C++ API. // Loads a simple ONNX model that adds floats. TEST_F(QnnHTPBackendTests, TestAddEpUsingPublicApi) { - auto session_has_qnn_ep = [](Ort::Session& session) -> bool { - // Access the underlying InferenceSession. 
- const OrtSession* ort_session = session; - const InferenceSession* s = reinterpret_cast(ort_session); - bool have_qnn_ep = false; - - for (const auto& provider : s->GetRegisteredProviderTypes()) { - if (provider == kQnnExecutionProvider) { - have_qnn_ep = true; - break; - } - } - return have_qnn_ep; - }; - onnxruntime::ProviderOptions options; #if defined(_WIN32) options["backend_path"] = "QnnHtp.dll"; @@ -77,8 +78,9 @@ TEST_F(QnnHTPBackendTests, TestAddEpUsingPublicApi) { so.AppendExecutionProvider("QNN", options); Ort::Session session(*ort_env, ort_model_path, so); - ASSERT_TRUE(session_has_qnn_ep(session)) << "QNN EP was not found in registered providers for session " - << "when added to session with name 'QNN'"; + ASSERT_TRUE(SessionHasEp(session, kQnnExecutionProvider)) + << "QNN EP was not found in registered providers for session " + << "providers for session when added to session with name 'QNN'"; } { @@ -92,8 +94,9 @@ TEST_F(QnnHTPBackendTests, TestAddEpUsingPublicApi) { so.AppendExecutionProvider(kQnnExecutionProvider, options); Ort::Session session(*ort_env, ort_model_path, so); - ASSERT_TRUE(session_has_qnn_ep(session)) << "QNN EP was not found in registered providers for session " - << "when added to session with name '" << kQnnExecutionProvider << "'"; + ASSERT_TRUE(SessionHasEp(session, kQnnExecutionProvider)) + << "QNN EP was not found in registered providers for session " + << "when added to session with name '" << kQnnExecutionProvider << "'"; } } @@ -1265,6 +1268,24 @@ TEST_F(QnnHTPBackendTests, LoadingAndUnloadingOfQnnLibrary_FixSegFault) { } #endif // !BUILD_QNN_EP_STATIC_LIB +#if defined(WIN32) && !BUILD_QNN_EP_STATIC_LIB +// Tests autoEP feature to automatically select an EP that supports the NPU. +// Currently only works on Windows. +TEST_F(QnnHTPBackendTests, AutoEp_PreferNpu) { + ASSERT_ORTSTATUS_OK(Ort::GetApi().RegisterExecutionProviderLibrary(*ort_env, kQnnExecutionProvider, + ORT_TSTR("onnxruntime_providers_qnn.dll"))); + + Ort::SessionOptions so; + so.SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy_PREFER_NPU); + + const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "nhwc_resize_sizes_opset18.quant.onnx"; + Ort::Session session(*ort_env, ort_model_path, so); + EXPECT_TRUE(SessionHasEp(session, kQnnExecutionProvider)); + + ASSERT_ORTSTATUS_OK(Ort::GetApi().UnregisterExecutionProviderLibrary(*ort_env, kQnnExecutionProvider)); +} +#endif // defined(WIN32) && !BUILD_QNN_EP_STATIC_LIB + #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) #endif // !defined(ORT_MINIMAL_BUILD) From 74dcf7e296639095dfa55d31336998b6f719ed76 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Mon, 5 May 2025 15:40:22 +1000 Subject: [PATCH 06/84] =?UTF-8?q?Fix=20OrtEpDevices=20sort.=20Debug=20asse?= =?UTF-8?q?rtion=20when=20there=20are=20two=20devices=20of=20the=20same=20?= =?UTF-8?q?type=E2=80=A6=20(#24633)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description Fix debug assertion when there are two devices of the same type that don't match the vendor. e.g. WebGPU and DML. 
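The comparator itself is not shown in full here, but the failure mode being fixed is the usual strict-weak-ordering violation: if a vendor-preference predicate can return true for both `comp(a, b)` and `comp(b, a)` (which happens when neither device matches the preferred vendor), MSVC's debug STL raises its "invalid comparator" assertion. A hypothetical sketch of the pattern and a well-formed alternative (not the actual ORT code):

```c++
// Hypothetical example only -- not the ORT comparator. Demonstrates how a
// vendor-preference predicate can violate strict weak ordering when neither
// device matches the preferred vendor.
#include <algorithm>
#include <string>
#include <vector>

struct Device { std::string vendor; };

int main() {
  const std::string preferred = "vendor_c";
  std::vector<Device> devices{{"vendor_a"}, {"vendor_b"}};

  // BAD: ignores b, so comp(a, b) and comp(b, a) are both true when neither
  // device matches the preferred vendor -> debug "invalid comparator" assertion.
  auto bad = [&](const Device& a, const Device& /*b*/) { return a.vendor != preferred; };

  // OK: a orders before b only when a matches the preferred vendor and b does not.
  auto good = [&](const Device& a, const Device& b) {
    return a.vendor == preferred && b.vendor != preferred;
  };

  (void)bad;  // kept only for contrast; sorting with it can assert in debug builds
  std::sort(devices.begin(), devices.end(), good);
  return 0;
}
```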
### Motivation and Context --- onnxruntime/core/session/provider_policy_context.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/session/provider_policy_context.cc b/onnxruntime/core/session/provider_policy_context.cc index 565891fe2cdfb..4ce13fe36ea86 100644 --- a/onnxruntime/core/session/provider_policy_context.cc +++ b/onnxruntime/core/session/provider_policy_context.cc @@ -95,7 +95,7 @@ std::vector OrderDevices(const std::vector Date: Mon, 5 May 2025 18:19:45 -0700 Subject: [PATCH 07/84] fix checks for metal when running under wasm (#24637) When under wasm we can't check for metal by looking at backend because it will always be WEBGPU. Because of this we'll take the DP4A path on metal that results in sub-optimal performance. Use vendor to check for metal instead. --- .../contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc | 3 ++- .../webgpu/quantization/subgroup_matrix_matmul_nbits.cc | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc index e3353c921094a..1411c07e97b2a 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc @@ -513,8 +513,9 @@ bool CanApplyDP4AMatrixMatMulNBits(onnxruntime::webgpu::ComputeContext& context, bool has_zero_points) { // macOS - Avoid using dp4a on Metal, as it does not appear to have native dp4a support. // https://github.com/gpuweb/gpuweb/issues/2677#issuecomment-1713292226 + // Use 'vendor' to check for metal; 'backend' is always WEBGPU when running under wasm. bool use_dp4a = context.HasFeature(wgpu::FeatureName::Subgroups) && - context.AdapterInfo().backendType != wgpu::BackendType::Metal; + context.AdapterInfo().vendor != std::string_view{"apple"}; return (accuracy_level == 4 && block_size % 32 == 0 && batch_count == 1 && components_k == 4 && K % 128 == 0 && N % 16 == 0 && !has_zero_points && use_dp4a); diff --git a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc index 09650be9358d0..674473a173445 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc @@ -248,8 +248,8 @@ bool CanApplySubgroupMatrixMatMulNBits(onnxruntime::webgpu::ComputeContext& cont // some precision issues with subgroupMatrixMultiplyAccumulate. It is possible to support higher accuracy // by setting compute_precision to Fp32, but that will be slower. For 1K token prefill FP16 Phi 3.5 is around 5s, // FP322 is around 7s. 
- return context.AdapterInfo().backendType == wgpu::BackendType::Metal && - has_subgroup_matrix && + return has_subgroup_matrix && + context.AdapterInfo().vendor == std::string_view{"apple"} && accuracy_level == 4 && block_size == 32 && batch_count == 1 && From 8bf5362a24300f60f5a6344fb1f493a1d847e94c Mon Sep 17 00:00:00 2001 From: Hector Li Date: Mon, 5 May 2025 20:52:23 -0700 Subject: [PATCH 08/84] Enable use_vcpkg for QNN Nuget package build and Python arm64ec build (#24640) ### Description enable use_vcpkg for QNN Nuget package build and Python arm64ec build --- .../github/azure-pipelines/templates/py-win-arm64ec-qnn.yml | 2 +- tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml index 6df46bfc8e1b0..1a00d67bdbb2a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml @@ -91,7 +91,7 @@ jobs: --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind - --parallel --update --arm64ec + --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --update --arm64ec $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} workingDirectory: '$(Build.BinariesDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index e4888ffd62df3..6bfc00b5b46eb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -20,7 +20,7 @@ stages: name: ${{ parameters.qnn_ep_build_pool_name }} variables: OrtPackageId: ${{ parameters.OrtNugetPackageId }} - commonBuildArgs: '--compile_no_warning_as_error --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_binskim_compliant_compile_flags ' + commonBuildArgs: '--compile_no_warning_as_error --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags ' steps: - template: set-version-number-variables-step.yml From 1ef7b1b7af5b905441f190a9602e1613a618ffeb Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Mon, 5 May 2025 21:00:47 -0700 Subject: [PATCH 09/84] Publish debug symbols for windows (#24643) --- ...artifacts-package-and-publish-steps-windows.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml index f15a2992e0d00..8c3a9eba82356 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml @@ -31,6 +31,20 @@ parameters: default: true steps: + - task: PublishSymbols@2 + displayName: 'Publish Build Debug Symbols' + condition: and(succeeded(), or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))) + inputs: + SymbolsFolder: 
'$(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\' + SearchPattern: '*.pdb' + IndexSources: true + PublishSymbols: true + SymbolServerType: 'TeamServices' + SymbolExpirationInDays: '36530' + IndexableFileFormats: 'Default' + DetailedLog: true + SymbolsArtifactName: 'Symbols_${{parameters.buildConfig}}' + - task: CmdLine@2 displayName: 'Copy build artifacts for zipping' inputs: From 9e7fcef4988ff3d53f75735d1d4ad7181a410912 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Mon, 5 May 2025 21:02:51 -0700 Subject: [PATCH 10/84] Add Python bindings for "compile" and "auto EP" APIs (#24614) ### Description Python API updates for auto ep selection and the compilation API. - Adds Python API `SessionOptions.add_provider()` (equivalent to C API's `SessionOptionsAppendExecutionProvider`) - Adds Python API `SessionOptions.add_provider_for_devices()` (equivalent to C API's `SessionOptionsAppendExecutionProvider_V2`) - Adds Python API `SessionOptions.set_provider_selection_policy()` (equivalent to C API's `SessionOptionsSetEpSelectionPolicy`) - Adds Python API class `ModelCompiler` to compile models (wraps C API's `OrtModelCompilationOptions` and `CompileModel()`) - TODO: Finish delegate callback. Need to add a `void*` parameter to delegate function. ### Sample program that uses autoep APIs Adapted from a unit test. ```python def test_cuda_prefer_gpu_and_inference(self): """ Test selecting CUDA EP via the PREFER_GPU policy and running inference. """ ep_lib_path = "onnxruntime_providers_cuda.dll" ep_registration_name = "CUDAExecutionProvider" if sys.platform != "win32": self.skipTest("Skipping test because device discovery is only supported on Windows") if not os.path.exists(ep_lib_path): self.skipTest(f"Skipping test because EP library '{ep_lib_path}' cannot be found") onnxrt.register_execution_provider_library(ep_registration_name, os.path.realpath(ep_lib_path)) # Set a policy to prefer GPU. Cuda should be selected. sess_options = onnxrt.SessionOptions() sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) self.assertTrue(sess_options.has_providers()) # Run sample model and check output sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name res = sess.run([], {input_name: x}) output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) ``` ### Sample program that uses compile APIs Adapted from a unit test that compiles using EP selection policy. ```python def test_compile_with_files_prefer_npu_policy(self): """ Tests compiling a model (to/from files) using an EP selection policy (PREFER_NPU). 
""" ep_lib_path = "onnxruntime_providers_qnn.dll" ep_registration_name = "QNNExecutionProvider" onnxrt.register_execution_provider_library(ep_registration_name, ep_lib_path) input_model_path = get_name("nhwc_resize_scales_opset18.onnx") output_model_path = os.path.join(self._tmp_dir_path, "model.compiled0.onnx") session_options = onnxrt.SessionOptions() session_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_NPU) model_compiler = onnxrt.ModelCompiler( session_options, input_model_path, embed_compiled_data_into_model=True, external_initializers_file_path=None, ) model_compiler.compile_to_file(output_model_path) self.assertTrue(os.path.exists(output_model_path)) onnxrt.unregister_execution_provider_library(ep_registration_name) ``` Adapted from a unit test that compiles using explicit EPs. ```python def test_compile_with_input_and_output_files(self): """ Tests compiling a model (to/from files) using explicit EP. """ provider = None provider_options = dict() if "QNNExecutionProvider" in available_providers: provider = "QNNExecutionProvider" provider_options["backend_type"] = "htp" # TODO(adrianlizarraga): Allow test to run for other compiling EPs (e.g., OpenVINO) input_model_path = get_name("nhwc_resize_scales_opset18.onnx") output_model_path = os.path.join(self._tmp_dir_path, "model.compiled1.onnx") session_options = onnxrt.SessionOptions() if provider: session_options.add_provider(provider, provider_options) model_compiler = onnxrt.ModelCompiler( session_options, input_model_path, embed_compiled_data_into_model=True, external_initializers_file_path=None, ) model_compiler.compile_to_file(output_model_path) self.assertTrue(os.path.exists(output_model_path)) ``` ### Motivation and Context --- onnxruntime/__init__.py | 8 + onnxruntime/core/session/compile_api.cc | 32 +- .../core/session/model_compilation_options.cc | 11 +- .../core/session/model_compilation_options.h | 10 +- onnxruntime/core/session/onnxruntime_c_api.cc | 53 +- onnxruntime/core/session/utils.cc | 131 ++++- onnxruntime/core/session/utils.h | 32 ++ .../onnxruntime_inference_collection.py | 126 ++++- .../python/onnxruntime_pybind_exceptions.cc | 6 + .../python/onnxruntime_pybind_exceptions.h | 6 + .../onnxruntime_pybind_model_compiler.cc | 80 +++ .../onnxruntime_pybind_model_compiler.h | 78 +++ .../python/onnxruntime_pybind_state.cc | 468 ++++++++++++++++-- .../python/onnxruntime_pybind_state_common.h | 23 +- onnxruntime/test/python/autoep_helper.py | 67 +++ .../test/python/onnxruntime_test_python.py | 19 +- .../python/onnxruntime_test_python_autoep.py | 167 +++++++ .../python/onnxruntime_test_python_backend.py | 16 +- .../onnxruntime_test_python_compile_api.py | 185 +++++++ tools/ci_build/build.py | 9 + 20 files changed, 1376 insertions(+), 151 deletions(-) create mode 100644 onnxruntime/python/onnxruntime_pybind_model_compiler.cc create mode 100644 onnxruntime/python/onnxruntime_pybind_model_compiler.h create mode 100644 onnxruntime/test/python/autoep_helper.py create mode 100644 onnxruntime/test/python/onnxruntime_test_python_autoep.py create mode 100644 onnxruntime/test/python/onnxruntime_test_python_compile_api.py diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index f8e13f7629e6b..6ef0707f4b7c6 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -30,6 +30,10 @@ NodeArg, # noqa: F401 OrtAllocatorType, # noqa: F401 OrtArenaCfg, # noqa: F401 + OrtEpDevice, # noqa: F401 + OrtExecutionProviderDevicePolicy, # noqa: F401 + OrtHardwareDevice, # noqa: F401 + 
OrtHardwareDeviceType, # noqa: F401 OrtMemoryInfo, # noqa: F401 OrtMemType, # noqa: F401 OrtSparseFormat, # noqa: F401 @@ -44,11 +48,14 @@ get_available_providers, # noqa: F401 get_build_info, # noqa: F401 get_device, # noqa: F401 + get_ep_devices, # noqa: F401 get_version_string, # noqa: F401 has_collective_ops, # noqa: F401 + register_execution_provider_library, # noqa: F401 set_default_logger_severity, # noqa: F401 set_default_logger_verbosity, # noqa: F401 set_seed, # noqa: F401 + unregister_execution_provider_library, # noqa: F401 ) import_capi_exception = None @@ -64,6 +71,7 @@ AdapterFormat, # noqa: F401 InferenceSession, # noqa: F401 IOBinding, # noqa: F401 + ModelCompiler, # noqa: F401 OrtDevice, # noqa: F401 OrtValue, # noqa: F401 SparseTensor, # noqa: F401 diff --git a/onnxruntime/core/session/compile_api.cc b/onnxruntime/core/session/compile_api.cc index a3f6addd100ad..ad128fee6cc3d 100644 --- a/onnxruntime/core/session/compile_api.cc +++ b/onnxruntime/core/session/compile_api.cc @@ -8,11 +8,13 @@ #include #include "core/common/common.h" +#include "core/session/allocator_adapters.h" #include "core/framework/error_code_helper.h" #include "core/session/abi_session_options_impl.h" #include "core/session/inference_session.h" #include "core/session/model_compilation_options.h" #include "core/session/ort_apis.h" +#include "core/session/ort_env.h" #include "core/session/utils.h" #else #include "core/common/common.h" @@ -43,7 +45,8 @@ ORT_API_STATUS_IMPL(OrtCompileAPI::CreateModelCompilationOptionsFromSessionOptio return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "The session_options argument must be a non-null pointer"); } - auto model_compile_options = std::make_unique(*env, *session_options); + auto model_compile_options = std::make_unique(env->GetEnvironment(), + *session_options); *out = reinterpret_cast(model_compile_options.release()); return nullptr; #else @@ -150,7 +153,7 @@ ORT_API_STATUS_IMPL(OrtCompileAPI::ModelCompilationOptions_SetOutputModelExterna ORT_API_STATUS_IMPL(OrtCompileAPI::ModelCompilationOptions_SetOutputModelBuffer, _In_ OrtModelCompilationOptions* ort_model_compile_options, - _Inout_ OrtAllocator* allocator, void** output_model_data_ptr, size_t* output_model_data_size_ptr) { + _Inout_ OrtAllocator* ort_allocator, void** output_model_data_ptr, size_t* output_model_data_size_ptr) { API_IMPL_BEGIN #if !defined(ORT_MINIMAL_BUILD) auto model_compile_options = reinterpret_cast(ort_model_compile_options); @@ -163,17 +166,18 @@ ORT_API_STATUS_IMPL(OrtCompileAPI::ModelCompilationOptions_SetOutputModelBuffer, return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Invalid output model buffer: size pointer is null"); } - if (allocator == nullptr) { + if (ort_allocator == nullptr) { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Invalid allocator for output model buffer: allocator pointer is null"); } - ORT_API_RETURN_IF_STATUS_NOT_OK(model_compile_options->SetOutputModelBuffer(allocator, + onnxruntime::AllocatorPtr allocator = std::make_shared(ort_allocator); + ORT_API_RETURN_IF_STATUS_NOT_OK(model_compile_options->SetOutputModelBuffer(std::move(allocator), output_model_data_ptr, output_model_data_size_ptr)); return nullptr; #else ORT_UNUSED_PARAMETER(ort_model_compile_options); - ORT_UNUSED_PARAMETER(allocator); + ORT_UNUSED_PARAMETER(ort_allocator); ORT_UNUSED_PARAMETER(output_model_data_ptr); ORT_UNUSED_PARAMETER(output_model_data_size_ptr); return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "Compile API is not supported in this build"); @@ -202,23 +206,7 @@ 
ORT_API_STATUS_IMPL(OrtCompileAPI::CompileModel, _In_ const OrtEnv* env, API_IMPL_BEGIN #if !defined(ORT_MINIMAL_BUILD) auto model_compile_options = reinterpret_cast(ort_model_compile_options); - ORT_API_RETURN_IF_STATUS_NOT_OK(model_compile_options->Check()); - - std::unique_ptr session; - const OrtSessionOptions* session_options = &model_compile_options->GetSessionOptions(); - - if (model_compile_options->InputModelComesFromFile()) { - PathString input_model_path = ToPathString(model_compile_options->GetInputModelPath()); - ORT_API_RETURN_IF_ERROR(CreateSessionAndLoadModel(session_options, env, - input_model_path.c_str(), - nullptr, 0, session)); - } else { - ORT_API_RETURN_IF_ERROR(CreateSessionAndLoadModel(session_options, env, nullptr, - model_compile_options->GetInputModelData(), - model_compile_options->GetInputModelDataSize(), session)); - } - - ORT_API_RETURN_IF_ERROR(InitializeSession(session_options, *session)); + ORT_API_RETURN_IF_STATUS_NOT_OK(onnxruntime::CompileModel(env->GetEnvironment(), *model_compile_options)); return nullptr; #else ORT_UNUSED_PARAMETER(env); diff --git a/onnxruntime/core/session/model_compilation_options.cc b/onnxruntime/core/session/model_compilation_options.cc index c4a7c5262d03d..d0cb092f78843 100644 --- a/onnxruntime/core/session/model_compilation_options.cc +++ b/onnxruntime/core/session/model_compilation_options.cc @@ -8,12 +8,12 @@ #include #include -#include "core/session/allocator_adapters.h" +#include "core/framework/allocator.h" #include "core/session/onnxruntime_session_options_config_keys.h" -#include "core/session/ort_env.h" +#include "core/session/environment.h" namespace onnxruntime { -ModelCompilationOptions::ModelCompilationOptions(const OrtEnv& env, const OrtSessionOptions& session_options) +ModelCompilationOptions::ModelCompilationOptions(const onnxruntime::Environment& env, const OrtSessionOptions& session_options) : env_(env), session_options_(session_options) { session_options_.value.has_explicit_ep_context_gen_options = true; session_options_.value.ep_context_gen_options = session_options.value.GetEpContextGenerationOptions(); @@ -86,15 +86,14 @@ void ModelCompilationOptions::SetOutputModelExternalInitializersFile(const std:: external_initializer_size_threshold; } -Status ModelCompilationOptions::SetOutputModelBuffer(OrtAllocator* allocator, +Status ModelCompilationOptions::SetOutputModelBuffer(onnxruntime::AllocatorPtr allocator, void** output_model_buffer_ptr, size_t* output_model_buffer_size_ptr) { ORT_RETURN_IF_ERROR(ResetOutputModelSettings()); session_options_.value.ep_context_gen_options.output_model_buffer_ptr = output_model_buffer_ptr; session_options_.value.ep_context_gen_options.output_model_buffer_size_ptr = output_model_buffer_size_ptr; - session_options_.value.ep_context_gen_options.output_model_buffer_allocator = - std::make_shared(allocator); + session_options_.value.ep_context_gen_options.output_model_buffer_allocator = std::move(allocator); return Status::OK(); } diff --git a/onnxruntime/core/session/model_compilation_options.h b/onnxruntime/core/session/model_compilation_options.h index 5ee64d48c3060..9238264003645 100644 --- a/onnxruntime/core/session/model_compilation_options.h +++ b/onnxruntime/core/session/model_compilation_options.h @@ -8,11 +8,13 @@ #include #include "core/common/status.h" #include "core/common/path_string.h" +#include "core/framework/allocator.h" #include "core/session/abi_session_options_impl.h" #include "core/session/onnxruntime_c_api.h" #include 
"core/session/onnxruntime_session_options_config_keys.h" namespace onnxruntime { +class Environment; /// /// Stores options to compile ONNX models into "EPContext" models. @@ -23,9 +25,9 @@ class ModelCompilationOptions { /// Creates an instance with the session options to use for model compilation. /// The session options are expected to have execution providers that compile. /// - /// Reference to OrtEnv + /// Reference to Environment /// Reference to session options - ModelCompilationOptions(const OrtEnv& env, const OrtSessionOptions& session_options); + ModelCompilationOptions(const onnxruntime::Environment& env, const OrtSessionOptions& session_options); /// /// Sets the file path to the input ONNX model to compile. @@ -67,7 +69,7 @@ class ModelCompilationOptions { /// Pointer to the buffer that will contain the compiled model /// Set to the size of the buffer /// Status indicating potential error - Status SetOutputModelBuffer(OrtAllocator* allocator, void** output_model_buffer_ptr, + Status SetOutputModelBuffer(onnxruntime::AllocatorPtr allocator, void** output_model_buffer_ptr, size_t* output_model_buffer_size_ptr); /// @@ -122,7 +124,7 @@ class ModelCompilationOptions { Status CheckInputModelSettings() const; Status CheckOutputModelSettings() const; - const OrtEnv& env_; + const onnxruntime::Environment& env_; OrtSessionOptions session_options_; std::string input_model_path_; const void* input_model_data_ = nullptr; diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index c70075b234faf..304966605c9cf 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2465,49 +2465,16 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_V2, _In_ OrtS _In_reads_(num_op_options) const char* const* ep_option_vals, size_t num_ep_options) { API_IMPL_BEGIN - if (num_ep_devices > 1) { - const auto& ep_name = ep_devices[0]->ep_name; - bool all_match = std::all_of(ep_devices + 1, ep_devices + num_ep_devices, - [&ep_name](const OrtEpDevice* ep_device) { return ep_device->ep_name == ep_name; }); - if (!all_match) { - return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, - "All OrtEpDevice values in ep_devices must have the same execution provider."); - } - } - - EpFactoryInternal* internal_factory = nullptr; - for (size_t i = 0; i < num_ep_devices; ++i) { - const OrtEpDevice* entry = ep_devices[i]; - - // we expect the internal factory to be available for internal and provider bridge EPs, which is all we support. - internal_factory = env->GetEnvironment().GetEpFactoryInternal(entry->ep_factory); - if (!internal_factory) { - return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "EP is not currently supported by this API"); - } - - // add the options to the session options with the EP prefix. 
- // first add the default values with prefix followed by user specified values so those win - const auto prefix = OrtSessionOptions::GetProviderOptionPrefix(entry->ep_name.c_str()); - auto& config_options = session_options->value.config_options; - for (const auto& [key, value] : entry->ep_options.entries) { - ORT_API_RETURN_IF_STATUS_NOT_OK(config_options.AddConfigEntry((prefix + key).c_str(), value.c_str())); - } - - for (size_t j = 0; j < num_ep_options; ++j) { - if (ep_option_keys[j] == nullptr) { - continue; - } - - ORT_API_RETURN_IF_STATUS_NOT_OK(config_options.AddConfigEntry((prefix + ep_option_keys[j]).c_str(), - ep_option_vals[j])); - } - } - - if (internal_factory) { - session_options->provider_factories.push_back( - std::make_unique( - *internal_factory, std::vector(ep_devices, ep_devices + num_ep_devices))); - } + std::unique_ptr provider_factory = nullptr; + + ORT_API_RETURN_IF_STATUS_NOT_OK(CreateIExecutionProviderFactoryForEpDevices( + env->GetEnvironment(), + session_options->value, + gsl::span(ep_devices, num_ep_devices), + gsl::span(ep_option_keys, num_ep_options), + gsl::span(ep_option_vals, num_ep_options), + /*output*/ provider_factory)); + session_options->provider_factories.push_back(std::move(provider_factory)); return nullptr; API_IMPL_END diff --git a/onnxruntime/core/session/utils.cc b/onnxruntime/core/session/utils.cc index d17514e54a945..c05394039d8c7 100644 --- a/onnxruntime/core/session/utils.cc +++ b/onnxruntime/core/session/utils.cc @@ -10,15 +10,18 @@ #include "core/session/environment.h" #include "core/session/inference_session.h" #include "core/session/inference_session_utils.h" -#include "core/session/ep_factory_internal.h" #include "core/session/onnxruntime_c_api.h" #include "core/session/onnxruntime_session_options_config_keys.h" -#include "core/session/ep_library_plugin.h" -#include "core/session/ep_library_provider_bridge.h" #include "core/session/ort_apis.h" #include "core/session/ort_env.h" -#include "core/session/onnxruntime_session_options_config_keys.h" + +#if !defined(ORT_MINIMAL_BUILD) +#include "core/session/ep_factory_internal.h" +#include "core/session/ep_library_plugin.h" +#include "core/session/ep_library_provider_bridge.h" +#include "core/session/model_compilation_options.h" #include "core/session/provider_policy_context.h" +#endif // !defined(ORT_MINIMAL_BUILD) using namespace onnxruntime; #if !defined(ORT_MINIMAL_BUILD) @@ -120,13 +123,14 @@ common::Status CopyStringToOutputArg(std::string_view str, const char* err_msg, return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, err_msg); } -// provider either model_path, or modal_data + model_data_length. -OrtStatus* CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, - _In_ const OrtEnv* env, - _In_opt_z_ const ORTCHAR_T* model_path, - _In_opt_ const void* model_data, - size_t model_data_length, - std::unique_ptr& sess) { +// Internal function that creates an InferenceSession and loads the model. +// Caller should provide either model_path, or modal_data + model_data_length. +static OrtStatus* CreateSessionAndLoadModelImpl(_In_ const OrtSessionOptions* options, + const onnxruntime::Environment& env, + _In_opt_z_ const ORTCHAR_T* model_path, + _In_opt_ const void* model_data, + size_t model_data_length, + std::unique_ptr& sess) { // quick check here to decide load path. InferenceSession will provide error message for invalid values. 
// TODO: Could move to a helper const Env& os_env = Env::Default(); // OS environment (!= ORT environment) @@ -155,12 +159,12 @@ OrtStatus* CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, if (model_path != nullptr) { sess = std::make_unique( options == nullptr ? onnxruntime::SessionOptions() : options->value, - env->GetEnvironment(), + env, model_path); } else { sess = std::make_unique( options == nullptr ? onnxruntime::SessionOptions() : options->value, - env->GetEnvironment(), + env, model_data, static_cast(model_data_length)); } #else @@ -169,20 +173,20 @@ OrtStatus* CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, } else { sess = std::make_unique( options == nullptr ? onnxruntime::SessionOptions() : options->value, - env->GetEnvironment()); + env); } #if !defined(ORT_MINIMAL_BUILD) // TEMPORARY for testing. Manually specify the EP to select. auto auto_select_ep_name = sess->GetSessionOptions().config_options.GetConfigEntry("test.ep_to_select"); if (auto_select_ep_name) { - ORT_API_RETURN_IF_STATUS_NOT_OK(TestAutoSelectEPsImpl(env->GetEnvironment(), *sess, *auto_select_ep_name)); + ORT_API_RETURN_IF_STATUS_NOT_OK(TestAutoSelectEPsImpl(env, *sess, *auto_select_ep_name)); } // if there are no providers registered, and there's an ep selection policy set, do auto ep selection if (options != nullptr && options->provider_factories.empty() && options->value.ep_selection_policy.enable) { ProviderPolicyContext context; - ORT_API_RETURN_IF_STATUS_NOT_OK(context.SelectEpsForSession(env->GetEnvironment(), *options, *sess)); + ORT_API_RETURN_IF_STATUS_NOT_OK(context.SelectEpsForSession(env, *options, *sess)); } #endif // !defined(ORT_MINIMAL_BUILD) @@ -209,6 +213,17 @@ OrtStatus* CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, return nullptr; } +// Creates an InferenceSession and loads the model. +// Caller should provide either model_path, or modal_data + model_data_length. 
+OrtStatus* CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, + _In_ const OrtEnv* env, + _In_opt_z_ const ORTCHAR_T* model_path, + _In_opt_ const void* model_data, + size_t model_data_length, + std::unique_ptr& sess) { + return CreateSessionAndLoadModelImpl(options, env->GetEnvironment(), model_path, model_data, model_data_length, sess); +} + OrtStatus* InitializeSession(_In_ const OrtSessionOptions* options, _In_ onnxruntime::InferenceSession& sess, _Inout_opt_ OrtPrepackedWeightsContainer* prepacked_weights_container) { @@ -245,6 +260,28 @@ OrtStatus* InitializeSession(_In_ const OrtSessionOptions* options, namespace onnxruntime { #if !defined(ORT_MINIMAL_BUILD) +Status CompileModel(const Environment& env, const ModelCompilationOptions& model_compile_options) { + ORT_RETURN_IF_ERROR(model_compile_options.Check()); + + std::unique_ptr session; + const OrtSessionOptions* session_options = &model_compile_options.GetSessionOptions(); + + if (model_compile_options.InputModelComesFromFile()) { + PathString input_model_path = ToPathString(model_compile_options.GetInputModelPath()); + ORT_RETURN_IF_ERROR(ToStatus(CreateSessionAndLoadModelImpl(session_options, env, + input_model_path.c_str(), + nullptr, 0, session))); + } else { + ORT_RETURN_IF_ERROR(ToStatus(CreateSessionAndLoadModelImpl(session_options, env, nullptr, + model_compile_options.GetInputModelData(), + model_compile_options.GetInputModelDataSize(), + session))); + } + + ORT_RETURN_IF_ERROR(ToStatus(InitializeSession(session_options, *session))); + return Status::OK(); +} + Status LoadPluginOrProviderBridge(const std::string& registration_name, const ORTCHAR_T* library_path, std::unique_ptr& ep_library, @@ -283,5 +320,65 @@ Status LoadPluginOrProviderBridge(const std::string& registration_name, return Status::OK(); } -#endif + +Status CreateIExecutionProviderFactoryForEpDevices(const Environment& env, + SessionOptions& session_options, + gsl::span ep_devices, + gsl::span ep_option_keys, + gsl::span ep_option_vals, + /*output*/ std::unique_ptr& out) { + if (ep_devices.empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Must provide one or more OrtEpDevice instances."); + } + + const size_t num_ep_options = ep_option_keys.size(); + if (ep_option_vals.size() != num_ep_options) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Must provide the same number of keys and values for EP options."); + } + + const auto& ep_name = ep_devices[0]->ep_name; + bool all_match = std::all_of(ep_devices.begin() + 1, ep_devices.end(), + [&ep_name](const OrtEpDevice* ep_device) { return ep_device->ep_name == ep_name; }); + if (!all_match) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "All OrtEpDevice values in ep_devices must have the same execution provider."); + } + + EpFactoryInternal* internal_factory = nullptr; + for (const OrtEpDevice* ep_device : ep_devices) { + // we expect the internal factory to be available for internal and provider bridge EPs, which is all we support. + internal_factory = env.GetEpFactoryInternal(ep_device->ep_factory); + if (!internal_factory) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "EP is not currently supported by this API"); + } + + // add the options to the session options with the EP prefix. 
+ // first add the default values with prefix followed by user specified values so those win + const std::string prefix = OrtSessionOptions::GetProviderOptionPrefix(ep_device->ep_name.c_str()); + auto& config_options = session_options.config_options; + for (const auto& [key, value] : ep_device->ep_options.entries) { + ORT_RETURN_IF_ERROR(config_options.AddConfigEntry((prefix + key).c_str(), value.c_str())); + } + + for (size_t j = 0; j < num_ep_options; ++j) { + if (ep_option_keys[j] == nullptr) { + continue; + } + + ORT_RETURN_IF_ERROR(config_options.AddConfigEntry((prefix + ep_option_keys[j]).c_str(), ep_option_vals[j])); + } + } + + if (!internal_factory) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "EP is not currently supported by this API"); + } + + out = std::make_unique(*internal_factory, + std::vector(ep_devices.begin(), + ep_devices.end())); + return Status::OK(); +} +#endif // !defined(ORT_MINIMAL_BUILD) } // namespace onnxruntime diff --git a/onnxruntime/core/session/utils.h b/onnxruntime/core/session/utils.h index 535a7b041609d..5a5dcae9165ed 100644 --- a/onnxruntime/core/session/utils.h +++ b/onnxruntime/core/session/utils.h @@ -3,7 +3,10 @@ #pragma once +#include +#include #include +#include #include "core/common/common.h" #include "core/session/onnxruntime_c_api.h" @@ -14,9 +17,18 @@ struct OrtStatus; struct OrtPrepackedWeightsContainer; namespace onnxruntime { class InferenceSession; +class ModelCompilationOptions; +} // namespace onnxruntime + +#if !defined(ORT_MINIMAL_BUILD) +namespace onnxruntime { +class Environment; class EpLibrary; class EpFactoryInternal; +struct IExecutionProviderFactory; +struct SessionOptions; } // namespace onnxruntime +#endif // !defined(ORT_MINIMAL_BUILD) OrtStatus* CreateSessionAndLoadModel(_In_ const OrtSessionOptions* options, _In_ const OrtEnv* env, @@ -29,8 +41,18 @@ OrtStatus* InitializeSession(_In_ const OrtSessionOptions* options, _In_ onnxruntime::InferenceSession& sess, _Inout_opt_ OrtPrepackedWeightsContainer* prepacked_weights_container = nullptr); +#if !defined(ORT_MINIMAL_BUILD) namespace onnxruntime { +/// +/// Compiles an ONNX model into a model with EPContext nodes. Each EPContext node represents a subgraph compiled for +/// a specific execution provider. +/// +/// A reference to the Environment instance. +/// An object specifying the compilation options. +/// A Status indicating an error or success. +Status CompileModel(const Environment& env, const ModelCompilationOptions& model_compile_options); + // load a library that is added using RegisterExecutionProviderLibrary. // infer whether it's a provider bridge library or plugin library Status LoadPluginOrProviderBridge(const std::string& registration_name, @@ -38,4 +60,14 @@ Status LoadPluginOrProviderBridge(const std::string& registration_name, std::unique_ptr& ep_library, std::vector& internal_factories); +// Creates an IExecutionProviderFactory instance for a list of OrtEpDevices that all refer to the same EP. +// Adds all provider options to the OrtSessionOptions configuration. 
+Status CreateIExecutionProviderFactoryForEpDevices(const Environment& env, + SessionOptions& session_options, + gsl::span ep_devices, + gsl::span ep_options_keys, + gsl::span ep_options_vals, + /*output*/ std::unique_ptr& out); + } // namespace onnxruntime +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index ed0298a85b8e7..e7dc4294f3672 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -172,10 +172,10 @@ class Session: This is the main class used to run a model. """ - def __init__(self): + def __init__(self, enable_fallback: bool = True): # self._sess is managed by the derived class and relies on bindings from C.InferenceSession self._sess = None - self._enable_fallback = True + self._enable_fallback = enable_fallback def get_session_options(self) -> onnxruntime.SessionOptions: "Return the session options. See :class:`onnxruntime.SessionOptions`." @@ -446,7 +446,7 @@ def __init__( means execute a node using `CUDAExecutionProvider` if capable, otherwise execute using `CPUExecutionProvider`. """ - super().__init__() + super().__init__(enable_fallback=int(kwargs.get("enable_fallback", 1)) == 1) if isinstance(path_or_bytes, (str, os.PathLike)): self._model_path = os.fspath(path_or_bytes) @@ -459,7 +459,6 @@ def __init__( self._sess_options = sess_options self._sess_options_initial = sess_options - self._enable_fallback = True if "read_config_from_model" in kwargs: self._read_config_from_model = int(kwargs["read_config_from_model"]) == 1 else: @@ -542,6 +541,16 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi providers, provider_options, available_providers ) + # Print a warning if user passed providers to InferenceSession() but the SessionOptions instance + # already has provider information (e.g., via add_provider_for_devices()). The providers specified + # here will take precedence. + if self._sess_options is not None and (providers or provider_options) and self._sess_options.has_providers(): + warnings.warn( + "Specified 'providers'/'provider_options' when creating InferenceSession but SessionOptions has " + "already been configured with providers. InferenceSession will only use the providers " + "passed to InferenceSession()." + ) + session_options = self._sess_options if self._sess_options else C.get_default_session_options() self._register_ep_custom_ops(session_options, providers, provider_options, available_providers) @@ -609,6 +618,115 @@ def _register_ep_custom_ops(self, session_options, providers, provider_options, C.register_nv_tensorrt_rtx_plugins_as_custom_ops(session_options, providers[i][1]) +class ModelCompiler: + """ + This class is used to compile an ONNX model. A compiled ONNX model has EPContext nodes that each + encapsulates a subgraph compiled/optimized for a specific execution provider. 
+ + Refer to the EPContext design document for more information about EPContext models: + https://onnxruntime.ai/docs/execution-providers/EP-Context-Design.html + + :: + + sess_options = onnxruntime.SessionOptions() + sess_options.add_provider("SomeExecutionProvider", {"option1": "value1"}) + # Alternatively, allow ONNX Runtime to select the provider automatically given a policy: + # sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_NPU) + + model_compiler = onnxruntime.ModelCompiler(sess_options, "input_model.onnx") + model_compiler.compile_to_file("output_model.onnx") + """ + + def __init__( + self, + sess_options: onnxruntime.SessionOptions, + input_model_path_or_bytes: str | os.PathLike | bytes, + embed_compiled_data_into_model: bool = False, + external_initializers_file_path: str | os.PathLike | None = None, + external_initializers_size_threshold: int = 1024, + ): + """ + Creates a ModelCompiler instance. + + :param sess_options: Session options containing the providers for which the model will be compiled. + Refer to SessionOptions.add_provider() and SessionOptions.set_provider_selection_policy(). + :param input_model_path_or_bytes: The path to the input model file or bytes representing a serialized + ONNX model. + :param embed_compiled_data_into_model: Defaults to False. Set to True to embed compiled binary data into + EPContext nodes in the compiled model. + :param external_initializers_file_path: Defaults to None. Set to a path for a file that will store the + initializers for non-compiled nodes. + :param external_initializers_size_threshold: Defaults to 1024. Ignored if `external_initializers_file_path` + is None or empty. Initializers larger than this threshold are stored in the external initializers file. + """ + input_model_path: str | os.PathLike | None = None + input_model_bytes: bytes | None = None + if isinstance(input_model_path_or_bytes, (str, os.PathLike)): + if not input_model_path_or_bytes: + raise ValueError("Input model path is empty") + input_model_path = os.fspath(input_model_path_or_bytes) + elif isinstance(input_model_path_or_bytes, bytes): + if len(input_model_path_or_bytes) == 0: + raise ValueError("Input model bytes array is empty") + input_model_bytes = input_model_path_or_bytes + else: + raise TypeError(f"Unable to load from type '{type(input_model_path_or_bytes)}'") + + if external_initializers_file_path: + if not isinstance(external_initializers_file_path, (str, os.PathLike)): + arg_type = type(external_initializers_file_path) + raise TypeError(f"Output external initializer filepath is of unexpected type '{arg_type}'") + external_initializers_file_path = os.fspath(external_initializers_file_path) + else: + external_initializers_file_path = "" + + if input_model_path: + self._model_compiler = C.ModelCompiler( + sess_options, + input_model_path, + True, # is path + embed_compiled_data_into_model, + external_initializers_file_path, + external_initializers_size_threshold, + ) + else: + self._model_compiler = C.ModelCompiler( + sess_options, + input_model_bytes, + False, # is bytes + embed_compiled_data_into_model, + external_initializers_file_path, + external_initializers_size_threshold, + ) + + def compile_to_file(self, output_model_path: str | None = None): + """ + Compiles to an output file. If an output file path is not provided, + the output file path is generated based on the input model path by replacing + '.onnx' with '_ctx.onnx'. 
Ex: The generated output file is 'model_ctx.onnx' for + an input model with path 'model.onnx'. + + Raises an 'InvalidArgument' exception if the compilation options are invalid. + + :param output_model_path: Defaults to None. The path for the output/compiled model. + """ + if output_model_path: + if not isinstance(output_model_path, (str, os.PathLike)): + raise TypeError(f"Output model's filepath is of unexpected type '{type(output_model_path)}'") + output_model_path = os.fspath(output_model_path) + self._model_compiler.compile_to_file(output_model_path) + + def compile_to_bytes(self) -> bytes: + """ + Compiles to bytes representing the serialized compiled ONNX model. + + Raises an 'InvalidArgument' exception if the compilation options are invalid. + + :return: A bytes object representing the compiled ONNX model. + """ + return self._model_compiler.compile_to_bytes() + + class IOBinding: """ This class provides API to bind input/output to a specified device, e.g. GPU. diff --git a/onnxruntime/python/onnxruntime_pybind_exceptions.cc b/onnxruntime/python/onnxruntime_pybind_exceptions.cc index 1a1aae6f48ad1..8f3b97c8c7786 100644 --- a/onnxruntime/python/onnxruntime_pybind_exceptions.cc +++ b/onnxruntime/python/onnxruntime_pybind_exceptions.cc @@ -35,6 +35,8 @@ void RegisterExceptions(pybind11::module& m) { pybind11::register_exception(m, "NotImplemented"); pybind11::register_exception(m, "InvalidGraph"); pybind11::register_exception(m, "EPFail"); + pybind11::register_exception(m, "ModelLoadCanceled"); + pybind11::register_exception(m, "ModelRequiresCompilation"); } void OrtPybindThrowIfError(onnxruntime::common::Status status) { @@ -61,6 +63,10 @@ void OrtPybindThrowIfError(onnxruntime::common::Status status) { throw InvalidGraph(std::move(msg)); case onnxruntime::common::StatusCode::EP_FAIL: throw EPFail(std::move(msg)); + case onnxruntime::common::StatusCode::MODEL_LOAD_CANCELED: + throw ModelLoadCanceled(std::move(msg)); + case onnxruntime::common::StatusCode::MODEL_REQUIRES_COMPILATION: + throw ModelRequiresCompilation(std::move(msg)); default: throw std::runtime_error(std::move(msg)); } diff --git a/onnxruntime/python/onnxruntime_pybind_exceptions.h b/onnxruntime/python/onnxruntime_pybind_exceptions.h index bc7d31ff2be2d..86bc4a5da8d46 100644 --- a/onnxruntime/python/onnxruntime_pybind_exceptions.h +++ b/onnxruntime/python/onnxruntime_pybind_exceptions.h @@ -44,6 +44,12 @@ struct InvalidGraph : std::runtime_error { struct EPFail : std::runtime_error { explicit EPFail(const std::string& what) : std::runtime_error(what) {} }; +struct ModelLoadCanceled : std::runtime_error { + explicit ModelLoadCanceled(const std::string& what) : std::runtime_error(what) {} +}; +struct ModelRequiresCompilation : std::runtime_error { + explicit ModelRequiresCompilation(const std::string& what) : std::runtime_error(what) {} +}; void RegisterExceptions(pybind11::module& m); diff --git a/onnxruntime/python/onnxruntime_pybind_model_compiler.cc b/onnxruntime/python/onnxruntime_pybind_model_compiler.cc new file mode 100644 index 0000000000000..8bb7ee2098caf --- /dev/null +++ b/onnxruntime/python/onnxruntime_pybind_model_compiler.cc @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// Licensed under the MIT License. 
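As a quick illustration of the new Python compile API shown above, the following is a minimal usage sketch; the provider name, provider options, and file paths are placeholders rather than part of this change::

    import onnxruntime as onnxrt

    sess_options = onnxrt.SessionOptions()
    # Hypothetical provider/options; use a provider available in your ORT build.
    sess_options.add_provider("QNNExecutionProvider", {"backend_path": "QnnHtp.dll"})

    # Compile from a file path and write the compiled EPContext model to disk.
    compiler = onnxrt.ModelCompiler(sess_options, "model.onnx", embed_compiled_data_into_model=True)
    compiler.compile_to_file("model_ctx.onnx")

    # Or compile an in-memory model and receive the compiled model as bytes.
    with open("model.onnx", "rb") as f:
        compiled_bytes = onnxrt.ModelCompiler(sess_options, f.read()).compile_to_bytes()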
+#include "python/onnxruntime_pybind_model_compiler.h" + +#include +#include +#include +#include "core/common/common.h" +#include "core/framework/error_code_helper.h" +#include "core/session/utils.h" + +namespace onnxruntime { +namespace python { + +onnxruntime::Status PyModelCompiler::Create(/*out*/ std::unique_ptr& out, + std::shared_ptr env, + const PySessionOptions& sess_options, + std::string&& input_model_path_or_bytes, bool input_model_is_path, + bool embed_compiled_data_into_model, + const std::string& external_initializers_file_path, + size_t external_initializers_size_threshold) { + auto model_compiler = std::make_unique(env, sess_options, PrivateConstructorTag{}); + ModelCompilationOptions& compile_options = model_compiler->model_compile_options_; + + if (input_model_is_path) { + compile_options.SetInputModelPath(input_model_path_or_bytes); + } else { + model_compiler->input_model_bytes_ = std::move(input_model_path_or_bytes); + compile_options.SetInputModelFromBuffer(reinterpret_cast(model_compiler->input_model_bytes_.data()), + model_compiler->input_model_bytes_.size()); + } + + ORT_RETURN_IF_ERROR(compile_options.SetEpContextEmbedMode(embed_compiled_data_into_model)); + + if (!external_initializers_file_path.empty()) { + compile_options.SetOutputModelExternalInitializersFile(external_initializers_file_path, + external_initializers_size_threshold); + } + + out = std::move(model_compiler); + return Status::OK(); +} + +onnxruntime::Status PyModelCompiler::CompileToFile(const std::string& output_model_path) { + ORT_RETURN_IF_ERROR(model_compile_options_.SetOutputModelPath(output_model_path)); + ORT_RETURN_IF_ERROR(onnxruntime::CompileModel(*env_, model_compile_options_)); + return Status::OK(); +} + +onnxruntime::Status PyModelCompiler::CompileToBytes(std::string& output_buffer) { + if (!output_buffer.empty()) { + // Opt to return an error if the output buffer is not empty instead of just calling output_buffer.clear() + // because the C++ standard does not explicitly require that capacity is unchanged by a call to clear(). + // Don't want to reallocate a large buffer an extra time unnecessarily. So, we'll consider this an internal + // ORT error. + // Refer to: https://en.cppreference.com/w/cpp/string/basic_string/clear + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Output buffer should be empty."); + } + + onnxruntime::AllocatorPtr allocator = std::make_shared(); + + void* buffer_data = nullptr; + size_t buffer_size = 0; + ORT_RETURN_IF_ERROR(model_compile_options_.SetOutputModelBuffer(allocator, &buffer_data, &buffer_size)); + ORT_RETURN_IF_ERROR(onnxruntime::CompileModel(*env_, model_compile_options_)); + + // Copy into output buffer. + output_buffer.reserve(buffer_size); + gsl::span src(reinterpret_cast(buffer_data), buffer_size); + std::copy(src.begin(), src.end(), std::back_inserter(output_buffer)); + return Status::OK(); +} + +PyModelCompiler::PyModelCompiler(std::shared_ptr env, const PySessionOptions& sess_options, + PrivateConstructorTag) + : env_(env), model_compile_options_(*env, sess_options) { +} +} // namespace python +} // namespace onnxruntime diff --git a/onnxruntime/python/onnxruntime_pybind_model_compiler.h b/onnxruntime/python/onnxruntime_pybind_model_compiler.h new file mode 100644 index 0000000000000..6c9f48fa00ba6 --- /dev/null +++ b/onnxruntime/python/onnxruntime_pybind_model_compiler.h @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// Licensed under the MIT License. +#pragma once + +#if !defined(ORT_MINIMAL_BUILD) +#include +#include +#include "core/common/status.h" +#include "core/session/model_compilation_options.h" +#include "python/onnxruntime_pybind_state_common.h" + +namespace onnxruntime { +class Environment; + +namespace python { +/// +/// Class exposed to Python that enables compiling ONNX models. +/// Internally wraps a onnxruntime::ModelCompilationOptions that stores and validates settings. +/// +class PyModelCompiler { + private: + // private tag to pass to constructor to ensure that constructor cannot be directly called externally + struct PrivateConstructorTag {}; + + public: + /// + /// Static class function that creates a unique_ptr with the given settings. + /// + /// Output parameter for the result + /// The Environment instance + /// The SessionOptions from which to initialize compilation options. + /// An r-value string that could be the input model's path or bytes + /// True if 'input_model_path_or_bytes' is a path, and false if its bytes. + /// True to embed compiled binary data into EPContext nodes. + /// The file into which to store initializers for non-compiled + /// nodes. + /// Ignored if 'external_initializers_file_path' is empty. + /// Initializers with a size greater than this threshold are dumped into the external file. + /// A Status indicating error or success. + static onnxruntime::Status Create(/*out*/ std::unique_ptr& out, + std::shared_ptr env, + const PySessionOptions& sess_options, + std::string&& input_model_path_or_bytes, bool input_model_is_path, + bool embed_compiled_data_into_model = false, + const std::string& external_initializers_file_path = {}, + size_t external_initializers_size_threshold = 1024); + + // Note: Creation should be done via Create(). This constructor is public so that it can be called from + // std::make_shared(). + PyModelCompiler(std::shared_ptr env, const PySessionOptions& sess_options, + PrivateConstructorTag); + + /// + /// Compiles the input model and saves the result to an output file. + /// If the 'output_model_path' is not specified, + /// it is generated based on the input model's path by replacing '.onnx' with '_ctx.onnx'. + /// + /// The path into which to save the compiled model. + /// A Status indicating error or success. + onnxruntime::Status CompileToFile(const std::string& output_model_path = {}); + + /// + /// Compiles the input model and stores the result into a buffer. + /// + /// A reference to the output buffer into which to store the + /// serialized ONNX model bytes. + /// A Status indicating error or success. + onnxruntime::Status CompileToBytes(std::string& output_buffer); + + private: + std::shared_ptr env_; + onnxruntime::ModelCompilationOptions model_compile_options_; + std::string input_model_bytes_; +}; +} // namespace python +} // namespace onnxruntime +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 0f15c5fbbdba0..c29a8b0497b1f 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -2,10 +2,15 @@ // SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates // Licensed under the MIT License. 
+#include #include "python/onnxruntime_pybind_exceptions.h" #include "python/onnxruntime_pybind_mlvalue.h" #include "python/onnxruntime_pybind_state_common.h" +#if !defined(ORT_MINIMAL_BUILD) +#include "python/onnxruntime_pybind_model_compiler.h" +#endif // !defined(ORT_MINIMAL_BUILD) + #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #define PY_ARRAY_UNIQUE_SYMBOL onnxruntime_python_ARRAY_API #include "python/numpy_helper.h" @@ -26,6 +31,7 @@ #include "core/graph/graph_viewer.h" #include "core/platform/env.h" #include "core/providers/get_execution_providers.h" +#include "core/providers/providers.h" #include "core/providers/tensorrt/tensorrt_provider_options.h" #include "core/session/IOBinding.h" #include "core/session/abi_session_options_impl.h" @@ -34,6 +40,13 @@ #include "core/session/lora_adapters.h" +#if !defined(ORT_MINIMAL_BUILD) +#include "core/session/abi_devices.h" +#include "core/session/ep_factory_internal.h" +#include "core/session/provider_policy_context.h" +#include "core/session/utils.h" +#endif + #ifdef ENABLE_ATEN #include "contrib_ops/cpu/aten_ops/aten_op_executor.h" #endif @@ -402,7 +415,7 @@ py::object AddTensorAsPyObj(const OrtValue& val, const DataTransferManager* data return GetPyObjFromTensor(val, data_transfer_manager, mem_cpy_to_host_functions); } -static std::unique_ptr LoadExecutionProvider( +static std::shared_ptr LoadExecutionProviderFactory( const std::string& ep_shared_lib_path, const ProviderOptions& provider_options = {}, const std::string& entry_symbol_name = "GetProvider") { @@ -417,8 +430,7 @@ static std::unique_ptr LoadExecutionProvider( OrtPybindThrowIfError(Env::Default().GetSymbolFromLibrary(handle, entry_symbol_name, (void**)&PGetProvider)); Provider* provider = PGetProvider(); - std::shared_ptr ep_factory = provider->CreateExecutionProviderFactory(&provider_options); - return ep_factory->CreateProvider(); + return provider->CreateExecutionProviderFactory(&provider_options); } #if defined(USE_CUDA) || defined(USE_CUDA_PROVIDER_INTERFACE) @@ -539,14 +551,21 @@ void RegisterNvTensorRTRtxPluginsAsCustomOps(PySessionOptions& so, const Provide } #endif -std::unique_ptr CreateExecutionProviderInstance( +/** + * Creates an IExecutionProviderFactory instance of the specified type. + * @param session_options The session options. + * @param type The execution provider type (e.g., CUDAExecutionProvider). + * @param provider_options_map A map of provider options. + * + * @return A shared_ptr with the factory instance, or null if unable to create it. + */ +static std::shared_ptr CreateExecutionProviderFactoryInstance( const SessionOptions& session_options, const std::string& type, const ProviderOptionsMap& provider_options_map) { if (type == kCpuExecutionProvider) { return onnxruntime::CPUProviderFactoryCreator::Create( - session_options.enable_cpu_mem_arena) - ->CreateProvider(); + session_options.enable_cpu_mem_arena); } else if (type == kTensorrtExecutionProvider) { #if defined(USE_TENSORRT) || defined(USE_TENSORRT_PROVIDER_INTERFACE) // If the environment variable 'ORT_TENSORRT_UNAVAILABLE' exists, then we do not load TensorRT. 
This is set by _ld_preload for the manylinux case @@ -869,11 +888,11 @@ std::unique_ptr CreateExecutionProviderInstance( } } if (std::shared_ptr tensorrt_provider_factory = onnxruntime::TensorrtProviderFactoryCreator::Create(¶ms)) { - return tensorrt_provider_factory->CreateProvider(); + return tensorrt_provider_factory; } } else { if (std::shared_ptr tensorrt_provider_factory = onnxruntime::TensorrtProviderFactoryCreator::Create(cuda_device_id)) { - return tensorrt_provider_factory->CreateProvider(); + return tensorrt_provider_factory; } } } @@ -892,11 +911,11 @@ std::unique_ptr CreateExecutionProviderInstance( ProviderOptions info = it->second; if (std::shared_ptr nv_tensorrt_rtx_provider_factory = onnxruntime::NvProviderFactoryCreator::Create( info, &session_options)) { - return nv_tensorrt_rtx_provider_factory->CreateProvider(); + return nv_tensorrt_rtx_provider_factory; } } else { if (std::shared_ptr nv_tensorrt_rtx_provider_factory = onnxruntime::NvProviderFactoryCreator::Create(cuda_device_id)) { - return nv_tensorrt_rtx_provider_factory->CreateProvider(); + return nv_tensorrt_rtx_provider_factory; } } } @@ -1024,12 +1043,12 @@ std::unique_ptr CreateExecutionProviderInstance( } if (std::shared_ptr migraphx_provider_factory = onnxruntime::MIGraphXProviderFactoryCreator::Create(¶ms)) { - return migraphx_provider_factory->CreateProvider(); + return migraphx_provider_factory; } } else { if (std::shared_ptr migraphx_provider_factory = onnxruntime::MIGraphXProviderFactoryCreator::Create(cuda_device_id)) { - return migraphx_provider_factory->CreateProvider(); + return migraphx_provider_factory; } } #endif @@ -1048,7 +1067,7 @@ std::unique_ptr CreateExecutionProviderInstance( // hence we must try to initialize it here if we can since FromProviderOptions might contain // external CUDA allocator. external_allocator_info = info.external_allocator_info; - return cuda_provider_info->CreateExecutionProviderFactory(info)->CreateProvider(); + return cuda_provider_info->CreateExecutionProviderFactory(info); } } #if defined(USE_CUDA) @@ -1081,7 +1100,7 @@ std::unique_ptr CreateExecutionProviderInstance( // however they still exist and are in-use. Nevertheless, it is used to return ROCMAllocator, hence we must // try to initialize it here if we can since FromProviderOptions might contain external ROCM allocator. 
external_allocator_info = info.external_allocator_info; - return rocm_provider_info->CreateExecutionProviderFactory(info)->CreateProvider(); + return rocm_provider_info->CreateExecutionProviderFactory(info); } else { if (!Env::Default().GetEnvironmentVar("ROCM_PATH").empty()) { ORT_THROW( @@ -1118,7 +1137,7 @@ std::unique_ptr CreateExecutionProviderInstance( #endif // !defined(DNNL_ORT_THREAD) dnnl_options.use_arena = session_options.enable_cpu_mem_arena; - return onnxruntime::DnnlProviderFactoryCreator::Create(&dnnl_options)->CreateProvider(); + return onnxruntime::DnnlProviderFactoryCreator::Create(&dnnl_options); #endif } else if (type == kOpenVINOExecutionProvider) { #if defined(USE_OPENVINO) || defined(USE_OPENVINO_PROVIDER_INTERFACE) @@ -1189,10 +1208,9 @@ std::unique_ptr CreateExecutionProviderInstance( } if (std::shared_ptr openvino_provider_factory = onnxruntime::OpenVINOProviderFactoryCreator::Create( &OV_provider_options_map, &session_options)) { - auto p = openvino_provider_factory->CreateProvider(); // Reset global variables config to avoid it being accidentally passed on to the next session openvino_device_type.clear(); - return p; + return openvino_provider_factory; } else { if (!Env::Default().GetEnvironmentVar("INTEL_OPENVINO_DIR").empty()) { ORT_THROW("INTEL_OPENVINO_DIR is set but OpenVINO library wasn't able to be loaded. Please install a supported version of OpenVINO as mentioned in the requirements page (https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements), ensure dependency libraries are in the PATH and your hardware is supported."); @@ -1210,7 +1228,7 @@ std::unique_ptr CreateExecutionProviderInstance( } info["session_options"] = std::to_string((uintptr_t)(void*)&session_options); if (auto vitisai_factory = onnxruntime::VitisAIProviderFactoryCreator::Create(info); vitisai_factory) { - return vitisai_factory->CreateProvider(); + return vitisai_factory; } LOGS_DEFAULT(WARNING) << "Failed to create " << type @@ -1238,21 +1256,18 @@ std::unique_ptr CreateExecutionProviderInstance( } } } - return onnxruntime::ACLProviderFactoryCreator::Create(enable_fast_math) - ->CreateProvider(); + return onnxruntime::ACLProviderFactoryCreator::Create(enable_fast_math); #endif } else if (type == kArmNNExecutionProvider) { #ifdef USE_ARMNN return onnxruntime::ArmNNProviderFactoryCreator::Create( - session_options.enable_cpu_mem_arena) - ->CreateProvider(); + session_options.enable_cpu_mem_arena); #endif } else if (type == kDmlExecutionProvider) { #ifdef USE_DML auto cit = provider_options_map.find(type); return onnxruntime::DMLProviderFactoryCreator::CreateFromProviderOptions( - session_options.config_options, cit == provider_options_map.end() ? ProviderOptions{} : cit->second, true) - ->CreateProvider(); + session_options.config_options, cit == provider_options_map.end() ? 
ProviderOptions{} : cit->second, true); #endif } else if (type == kNnapiExecutionProvider) { #if defined(USE_NNAPI) @@ -1261,15 +1276,15 @@ std::unique_ptr CreateExecutionProviderInstance( #endif const auto partitioning_stop_ops_list = session_options.config_options.GetConfigEntry( kOrtSessionOptionsConfigNnapiEpPartitioningStopOps); - return onnxruntime::NnapiProviderFactoryCreator::Create(0, partitioning_stop_ops_list)->CreateProvider(); + return onnxruntime::NnapiProviderFactoryCreator::Create(0, partitioning_stop_ops_list); #endif } else if (type == kVSINPUExecutionProvider) { #ifdef USE_VSINPU - return onnxruntime::VSINPUProviderFactoryCreator::Create()->CreateProvider(); + return onnxruntime::VSINPUProviderFactoryCreator::Create(); #endif } else if (type == kRknpuExecutionProvider) { #ifdef USE_RKNPU - return onnxruntime::RknpuProviderFactoryCreator::Create()->CreateProvider(); + return onnxruntime::RknpuProviderFactoryCreator::Create(); #endif } else if (type == kCoreMLExecutionProvider) { #if defined(USE_COREML) @@ -1300,36 +1315,35 @@ std::unique_ptr CreateExecutionProviderInstance( } } else { // read from provider_options - return onnxruntime::CoreMLProviderFactoryCreator::Create(options)->CreateProvider(); + return onnxruntime::CoreMLProviderFactoryCreator::Create(options); } } - return onnxruntime::CoreMLProviderFactoryCreator::Create(coreml_flags)->CreateProvider(); + return onnxruntime::CoreMLProviderFactoryCreator::Create(coreml_flags); #endif } else if (type == kXnnpackExecutionProvider) { #if defined(USE_XNNPACK) auto cit = provider_options_map.find(type); return onnxruntime::XnnpackProviderFactoryCreator::Create( - cit == provider_options_map.end() ? ProviderOptions{} : cit->second, &session_options) - ->CreateProvider(); + cit == provider_options_map.end() ? ProviderOptions{} : cit->second, &session_options); #endif } else if (type == kWebGpuExecutionProvider) { #if defined(USE_WEBGPU) - return onnxruntime::WebGpuProviderFactoryCreator::Create(session_options.config_options)->CreateProvider(); + return onnxruntime::WebGpuProviderFactoryCreator::Create(session_options.config_options); #endif } else if (type == kCannExecutionProvider) { #ifdef USE_CANN if (auto* cann_provider_info = TryGetProviderInfo_CANN()) { const CANNExecutionProviderInfo info = GetCannExecutionProviderInfo(cann_provider_info, provider_options_map); - return cann_provider_info->CreateExecutionProviderFactory(info)->CreateProvider(); + return cann_provider_info->CreateExecutionProviderFactory(info); } else { ORT_THROW("create CANN ExecutionProvider fail"); } #endif } else if (type == kAzureExecutionProvider) { #ifdef USE_AZURE - return onnxruntime::AzureProviderFactoryCreator::Create({})->CreateProvider(); + return onnxruntime::AzureProviderFactoryCreator::Create({}); #endif } else if (type == kQnnExecutionProvider) { #if defined(USE_QNN) || defined(USE_QNN_PROVIDER_INTERFACE) @@ -1337,7 +1351,7 @@ std::unique_ptr CreateExecutionProviderInstance( auto qnn_factory = onnxruntime::QNNProviderFactoryCreator::Create( cit == provider_options_map.end() ? 
ProviderOptions{} : cit->second, &session_options); if (qnn_factory) { - return qnn_factory->CreateProvider(); + return qnn_factory; } LOGS_DEFAULT(WARNING) << "Failed to create " << type @@ -1362,7 +1376,7 @@ std::unique_ptr CreateExecutionProviderInstance( provider_options.insert(option); } } - return LoadExecutionProvider(shared_lib_path_it->second, provider_options, entry_symbol); + return LoadExecutionProviderFactory(shared_lib_path_it->second, provider_options, entry_symbol); } } // unknown provider @@ -1371,18 +1385,56 @@ std::unique_ptr CreateExecutionProviderInstance( return nullptr; } +/** + * Create an IExecutionProvider instance of the specified type. Note: this is called by orttraining code. + * @param session_options The session options. + * @param type The execution provider type (e.g., CUDAExecutionProvider). + * @param provider_options_map A map of provider options. + * + * @return A unique_ptr with the execution provider instance, or null if unable to create it. + */ +std::unique_ptr CreateExecutionProviderInstance(const SessionOptions& session_options, + const std::string& type, + const ProviderOptionsMap& provider_options_map) { + auto ep_factory = CreateExecutionProviderFactoryInstance(session_options, type, provider_options_map); + if (ep_factory) { + return ep_factory->CreateProvider(); + } + return nullptr; +} + /* * Register execution provider with options. */ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector& provider_types, const ProviderOptionsMap& provider_options_map) { - ORT_UNUSED_PARAMETER(provider_options_map); - for (const std::string& type : provider_types) { auto ep = CreateExecutionProviderInstance(sess->GetSessionOptions(), type, provider_options_map); - if (ep) + if (ep) { OrtPybindThrowIfError(sess->RegisterExecutionProvider(std::move(ep))); + } + } +} + +/** + * Adds an explicit execution provider factory to the session options. + * + * @param py_sess_options The session options. + * @param provider_type The type of the provider to add. + * @param provider_options The options for the execution provider as a map of string key/value pairs. + * + * @return A Status indicating an error or success. + */ +static Status AddExplicitEpFactory(PySessionOptions& py_sess_options, const std::string& provider_type, + const ProviderOptions& provider_options) { + const ProviderOptionsMap provider_options_map = {{provider_type, provider_options}}; + auto ep_factory = CreateExecutionProviderFactoryInstance(py_sess_options.value, provider_type, provider_options_map); + if (!ep_factory) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Failed to add provider of type '", + provider_type, "' to SessionOptions. Provider configuration is not supported."); } + py_sess_options.provider_factories.push_back(std::move(ep_factory)); + return Status::OK(); } /** @@ -1426,6 +1478,83 @@ static void RegisterCustomOpDomains(PyInferenceSession* sess, const PySessionOpt } #endif +#if !defined(ORT_MINIMAL_BUILD) +/** + * Add the execution provider that is responsible for the selected OrtEpDevice instances to the session options. + * + * @param py_sess_options The session options. + * @param provider_type The type of the provider to add. + * @param provider_options The options for the execution provider as a map of string key/value pairs. + * + * @return A Status indicating an error or success. 
+ */ +static Status AddEpFactoryFromEpDevices(PySessionOptions& py_sess_options, + const std::vector& ep_devices, + const ProviderOptions& provider_options) { + std::shared_ptr env = GetEnv(); + const size_t num_ep_options = provider_options.size(); + std::vector ep_option_keys; + std::vector ep_option_vals; + + ep_option_keys.reserve(num_ep_options); + ep_option_vals.reserve(num_ep_options); + for (const auto& [key, val] : provider_options) { + ep_option_keys.push_back(key.c_str()); + ep_option_vals.push_back(val.c_str()); + } + + std::unique_ptr provider_factory = nullptr; + ORT_RETURN_IF_ERROR(CreateIExecutionProviderFactoryForEpDevices(*env, + py_sess_options.value, + ep_devices, + ep_option_keys, + ep_option_vals, + /*output*/ provider_factory)); + py_sess_options.provider_factories.push_back(std::move(provider_factory)); + return Status::OK(); +} + +/** + * Initializes the inference session using EPs specified in the session options. + * + * @param py_sess The inference session. + * @param disabled_optimizer_names Set of optimizers to disable. + * @return A Status indicating error or success. + */ +static Status InitializeSessionEpsFromSessionOptions(PyInferenceSession& py_sess, + const std::unordered_set& disabled_optimizer_names) { + ORT_RETURN_IF(py_sess.GetSessionHandle() == nullptr, "Invalid Python InferenceSession handle"); + InferenceSession& sess = *py_sess.GetSessionHandle(); + + const logging::Logger* sess_logger = sess.GetLogger(); + ORT_RETURN_IF(sess_logger == nullptr, "Invalid InferenceSession logger handle"); + + const OrtSessionOptions& ort_session_options = py_sess.GetOrtSessionOptions(); + + // if there are no providers registered, and there's an ep selection policy set, do auto ep selection + if (ort_session_options.provider_factories.empty() && ort_session_options.value.ep_selection_policy.enable) { + ProviderPolicyContext context; + ORT_RETURN_IF_ERROR(context.SelectEpsForSession(*GetEnv(), ort_session_options, sess)); + } else { + for (const auto& provider_factory : ort_session_options.provider_factories) { + std::unique_ptr ep = provider_factory->CreateProvider(ort_session_options, + *(sess_logger->ToExternal())); + if (ep) { + ORT_RETURN_IF_ERROR(sess.RegisterExecutionProvider(std::move(ep))); + } + } + } + + if (!disabled_optimizer_names.empty()) { + ORT_RETURN_IF_ERROR(sess.FilterEnabledOptimizers({disabled_optimizer_names.cbegin(), + disabled_optimizer_names.cend()})); + } + + ORT_RETURN_IF_ERROR(sess.Initialize()); + return Status::OK(); +} +#endif // !defined(ORT_MINIMAL_BUILD) + void InitializeSession(InferenceSession* sess, ExecutionProviderRegistrationFn ep_registration_fn, const std::vector& provider_types, @@ -1528,6 +1657,43 @@ void addGlobalMethods(py::module& m) { throw std::runtime_error("Error when creating and registering allocator in create_and_register_allocator_v2: " + st.ErrorMessage()); } }); + m.def( + "register_execution_provider_library", + [](const std::string& registration_name, const PathString& library_path) -> void { +#if !defined(ORT_MINIMAL_BUILD) + std::shared_ptr env = GetEnv(); + OrtPybindThrowIfError(env->RegisterExecutionProviderLibrary(registration_name, library_path.c_str())); +#else + ORT_UNUSED_PARAMETER(registration_name); + ORT_UNUSED_PARAMETER(library_path); + ORT_THROW("Execution provider libraries are not supported in this build."); +#endif + }, + R"pbdoc(Register an execution provider library with ONNX Runtime.)pbdoc"); + m.def( + "unregister_execution_provider_library", + [](const std::string& registration_name) 
-> void { +#if !defined(ORT_MINIMAL_BUILD) + std::shared_ptr env = GetEnv(); + OrtPybindThrowIfError(env->UnregisterExecutionProviderLibrary(registration_name)); +#else + ORT_UNUSED_PARAMETER(registration_name); + ORT_THROW("Execution provider libraries are not supported in this build."); +#endif + }, + R"pbdoc(Unregister an execution provider library from ONNX Runtime.)pbdoc"); + m.def( + "get_ep_devices", + []() -> const std::vector& { +#if !defined(ORT_MINIMAL_BUILD) + std::shared_ptr env = GetEnv(); + return env->GetOrtEpDevices(); +#else + ORT_THROW("OrtEpDevices are not supported in this build"); +#endif + }, + R"pbdoc(Get the list of available OrtEpDevice instances.)pbdoc", + py::return_value_policy::reference); #if defined(USE_OPENVINO) || defined(USE_OPENVINO_PROVIDER_INTERFACE) m.def( @@ -1632,6 +1798,39 @@ void addGlobalMethods(py::module& m) { #endif } +// TODO(adrianlizarraga): C API's delegate function needs a void* param to store state. +// using PyEpSelectionDelegate = std::function(const std::vector& ep_devices, +// const std::unordered_map& model_metadata, +// const std::unordered_map& runtime_metadata)>; +// +// static OrtStatus* PyDelegateWrapper(void* delegate_state, +// _In_ const OrtEpDevice** ep_devices, +// _In_ size_t num_devices, +// _In_ const OrtKeyValuePairs* model_metadata, +// _In_opt_ const OrtKeyValuePairs* runtime_metadata, +// _Inout_ const OrtEpDevice** selected, +// _In_ size_t max_selected, +// _Out_ size_t* num_selected) { +// PyEpSelectionDelegate* actual_delegate = reinterpret_cast(delegate_state); +// std::vector py_ep_devices(ep_devices, ep_devices + num_devices); +// std::unordered_map py_model_metadata = +// model_metadata ? model_metadata->entries : std::unordered_map{}; +// std::unordered_map py_runtime_metadata = +// runtime_metadata ? runtime_metadata->entries : std::unordered_map{}; +// +// std::vector py_selected = (*actual_delegate)(py_ep_devices, py_model_metadata, py_runtime_metadata); +// +// // TODO: Check max_selected and return OrtStatus if necessary. 
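The module-level bindings registered above (register_execution_provider_library, unregister_execution_provider_library, get_ep_devices) can be exercised roughly as follows; the registration name and library path are placeholders for an actual plugin EP library, and the calls raise in builds without support::

    import onnxruntime as onnxrt

    # Hypothetical plugin EP library path.
    onnxrt.register_execution_provider_library("example_ep", "/path/to/example_ep.so")

    for ep_device in onnxrt.get_ep_devices():
        print(ep_device.ep_name, ep_device.ep_vendor, ep_device.device.type)

    onnxrt.unregister_execution_provider_library("example_ep")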
+// assert(py_selected.size() <= max_selected); +// +// *num_selected = py_selected.size(); +// for (size_t i = 0; i < py_selected.size(); ++i) { +// selected[i] = py_selected[i]; +// } +// +// return nullptr; +// }; + void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn) { py::enum_(m, "GraphOptimizationLevel") .value("ORT_DISABLE_ALL", GraphOptimizationLevel::ORT_DISABLE_ALL) @@ -1672,6 +1871,75 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra .def_static("webgpu", []() { return OrtDevice::GPU; }) .def_static("default_memory", []() { return OrtDevice::MemType::DEFAULT; }); + py::enum_(m, "OrtExecutionProviderDevicePolicy") + .value("DEFAULT", OrtExecutionProviderDevicePolicy_DEFAULT) + .value("PREFER_CPU", OrtExecutionProviderDevicePolicy_PREFER_CPU) + .value("PREFER_NPU", OrtExecutionProviderDevicePolicy_PREFER_NPU) + .value("PREFER_GPU", OrtExecutionProviderDevicePolicy_PREFER_GPU) + .value("MAX_PERFORMANCE", OrtExecutionProviderDevicePolicy_MAX_PERFORMANCE) + .value("MAX_EFFICIENCY", OrtExecutionProviderDevicePolicy_MAX_EFFICIENCY) + .value("MIN_OVERALL_POWER", OrtExecutionProviderDevicePolicy_MIN_OVERALL_POWER); + + py::enum_(m, "OrtHardwareDeviceType") + .value("CPU", OrtHardwareDeviceType_CPU) + .value("GPU", OrtHardwareDeviceType_GPU) + .value("NPU", OrtHardwareDeviceType_NPU); + + py::class_ py_hw_device(m, "OrtHardwareDevice", R"pbdoc(ONNX Runtime hardware device information.)pbdoc"); + py_hw_device.def_property_readonly( + "type", + [](OrtHardwareDevice* hw_device) -> OrtHardwareDeviceType { return hw_device->type; }, + R"pbdoc(Hardware device's type.)pbdoc") + .def_property_readonly( + "vendor_id", + [](OrtHardwareDevice* hw_device) -> uint32_t { return hw_device->vendor_id; }, + R"pbdoc(Hardware device's vendor identifier.)pbdoc") + .def_property_readonly( + "vendor", + [](OrtHardwareDevice* hw_device) -> std::string { return hw_device->vendor; }, + R"pbdoc(Hardware device's vendor name.)pbdoc") + .def_property_readonly( + "device_id", + [](OrtHardwareDevice* hw_device) -> uint32_t { return hw_device->device_id; }, + R"pbdoc(Hardware device's unique identifier.)pbdoc") + .def_property_readonly( + "metadata", + [](OrtHardwareDevice* hw_device) -> std::unordered_map { + return hw_device->metadata.entries; + }, + R"pbdoc(Hardware device's metadata as string key/value pairs.)pbdoc"); + + py::class_ py_ep_device(m, "OrtEpDevice", + R"pbdoc(Represents a hardware device that an execution provider supports +for model inference.)pbdoc"); + py_ep_device.def_property_readonly( + "ep_name", + [](OrtEpDevice* ep_device) -> std::string { return ep_device->ep_name; }, + R"pbdoc(The execution provider's name.)pbdoc") + .def_property_readonly( + "ep_vendor", + [](OrtEpDevice* ep_device) -> std::string { return ep_device->ep_vendor; }, + R"pbdoc(The execution provider's vendor name.)pbdoc") + .def_property_readonly( + "ep_metadata", + [](OrtEpDevice* ep_device) -> std::unordered_map { + return ep_device->ep_metadata.entries; + }, + R"pbdoc(The execution provider's additional metadata for the OrtHardwareDevice.)pbdoc") + .def_property_readonly( + "ep_options", + [](OrtEpDevice* ep_device) -> std::unordered_map { + return ep_device->ep_options.entries; + }, + R"pbdoc(The execution provider's options used to configure the provider to use the OrtHardwareDevice.)pbdoc") + .def_property_readonly( + "device", + [](OrtEpDevice* ep_device) -> const OrtHardwareDevice& { + return *ep_device->device; + }, + R"pbdoc(The OrtHardwareDevice 
instance for the OrtEpDevice.)pbdoc", + py::return_value_policy::reference_internal); + py::class_ ort_arena_cfg_binding(m, "OrtArenaCfg"); // Note: Doesn't expose initial_growth_chunk_sizes_bytes/max_power_of_two_extend_bytes option. // This constructor kept for backwards compatibility, key-value pair constructor overload exposes all options @@ -1736,6 +2004,59 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra sess(m, "SessionOptions", R"pbdoc(Configuration information for a session.)pbdoc"); sess .def(py::init()) + .def( + // Equivalent to the C API's SessionOptionsAppendExecutionProvider. + "add_provider", + [](PySessionOptions* sess_options, + const std::string& provider_name, + const ProviderOptions& provider_options = {}) { + OrtPybindThrowIfError(AddExplicitEpFactory(*sess_options, provider_name, provider_options)); + }, + R"pbdoc(Adds an explicit execution provider.)pbdoc") + .def( + // Equivalent to the C API's SessionOptionsAppendExecutionProvider_V2. + "add_provider_for_devices", + [](PySessionOptions* sess_options, + const std::vector& ep_devices, + const ProviderOptions& provider_options = {}) { +#if !defined(ORT_MINIMAL_BUILD) + OrtPybindThrowIfError(AddEpFactoryFromEpDevices(*sess_options, + ep_devices, + provider_options)); +#else + ORT_UNUSED_PARAMETER(sess_options); + ORT_UNUSED_PARAMETER(ep_devices); + ORT_UNUSED_PARAMETER(provider_options); + ORT_THROW("OrtEpDevices are not supported in this build"); +#endif + }, + R"pbdoc(Adds the execution provider that is responsible for the selected OrtEpDevice instances. All OrtEpDevice instances +must refer to the same execution provider.)pbdoc") + .def( + // Equivalent to the C API's SessionOptionsSetEpSelectionPolicy. + "set_provider_selection_policy", + [](PySessionOptions* sess_options, + OrtExecutionProviderDevicePolicy policy) { +#if !defined(ORT_MINIMAL_BUILD) + sess_options->value.ep_selection_policy.enable = true; + sess_options->value.ep_selection_policy.policy = policy; + sess_options->value.ep_selection_policy.delegate = nullptr; // TODO: need a void* param in delegate. +#else + ORT_UNUSED_PARAMETER(sess_options); + ORT_UNUSED_PARAMETER(policy); + ORT_THROW("EP selection policies are not supported in this build"); +#endif + }, + R"pbdoc(Sets the execution provider selection policy for the session. 
Allows users to specify a +selection policy for automatic execution provider (EP) selection, or provide a delegate callback +for custom selection logic.)pbdoc") + .def( + "has_providers", + [](PySessionOptions* sess_options) -> bool { + return !sess_options->provider_factories.empty() || sess_options->value.ep_selection_policy.enable; + }, + R"pbdoc(Returns true if the SessionOptions has been configured with providers, OrtEpDevices, or +policies that will run the model.)pbdoc") .def_property( "enable_cpu_mem_arena", [](const PySessionOptions* options) -> bool { return options->value.enable_cpu_mem_arena; }, @@ -2132,11 +2453,18 @@ including arg name, arg type (contains both type and shape).)pbdoc") const std::vector& provider_types = {}, const ProviderOptionsVector& provider_options = {}, const std::unordered_set& disabled_optimizer_names = {}) { - InitializeSession(sess->GetSessionHandle(), - ep_registration_fn, - provider_types, - provider_options, - disabled_optimizer_names); + // If the user did not explicitly specify providers when creating InferenceSession and the SessionOptions + // has provider information (i.e., either explicit EPs or an EP selection policy), then use the information + // in the session options to initialize the session. + if (provider_types.empty() && sess->HasProvidersInSessionOptions()) { + OrtPybindThrowIfError(InitializeSessionEpsFromSessionOptions(*sess, disabled_optimizer_names)); + } else { + InitializeSession(sess->GetSessionHandle(), + ep_registration_fn, + provider_types, + provider_options, + disabled_optimizer_names); + } }, R"pbdoc(Load a model saved in ONNX or ORT format.)pbdoc") .def("run", @@ -2395,6 +2723,58 @@ including arg name, arg type (contains both type and shape).)pbdoc") .value("kNextPowerOfTwo", onnxruntime::ArenaExtendStrategy::kNextPowerOfTwo) .value("kSameAsRequested", onnxruntime::ArenaExtendStrategy::kSameAsRequested) .export_values(); + + py::class_(m, "ModelCompiler", + R"pbdoc(This is the class used to compile an ONNX model.)pbdoc") + .def(py::init([](const PySessionOptions& sess_options, + std::string path_or_bytes, + bool is_path, + bool embed_compiled_data_into_model = false, + std::string external_initializers_file_path = {}, + size_t external_initializers_size_threshold = 1024) { +#if !defined(ORT_MINIMAL_BUILD) + std::unique_ptr result; + OrtPybindThrowIfError(PyModelCompiler::Create(result, GetEnv(), sess_options, + std::move(path_or_bytes), is_path, + embed_compiled_data_into_model, + external_initializers_file_path, + external_initializers_size_threshold)); + return result; +#else + ORT_UNUSED_PARAMETER(sess_options); + ORT_UNUSED_PARAMETER(path_or_bytes); + ORT_UNUSED_PARAMETER(is_path); + ORT_UNUSED_PARAMETER(embed_compiled_data_into_model); + ORT_UNUSED_PARAMETER(external_initializers_file_path); + ORT_UNUSED_PARAMETER(external_initializers_size_threshold); + ORT_THROW("Compile API is not supported in this build."); +#endif + })) + .def( + "compile_to_file", + [](PyModelCompiler* model_compiler, std::string output_model_path = {}) { +#if !defined(ORT_MINIMAL_BUILD) + OrtPybindThrowIfError(model_compiler->CompileToFile(output_model_path)); +#else + ORT_UNUSED_PARAMETER(model_compiler); + ORT_UNUSED_PARAMETER(output_model_path); + ORT_THROW("Compile API is not supported in this build."); +#endif + }, + R"pbdoc(Compile an ONNX model into a file.)pbdoc") + .def( + "compile_to_bytes", + [](PyModelCompiler* model_compiler) -> py::bytes { +#if !defined(ORT_MINIMAL_BUILD) + std::string output_bytes; + 
OrtPybindThrowIfError(model_compiler->CompileToBytes(output_bytes)); + return py::bytes(output_bytes); +#else + ORT_UNUSED_PARAMETER(model_compiler); + ORT_THROW("Compile API is not supported in this build."); +#endif + }, + R"pbdoc(Compile an ONNX model into a buffer.)pbdoc"); } bool CreateInferencePybindStateModule(py::module& m) { diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h index 3ae5c0d289c21..a964f199d43b3 100644 --- a/onnxruntime/python/onnxruntime_pybind_state_common.h +++ b/onnxruntime/python/onnxruntime_pybind_state_common.h @@ -234,13 +234,13 @@ using PySessionOptions = OrtSessionOptions; // Thin wrapper over internal C++ InferenceSession to accommodate custom op library management for the Python user struct PyInferenceSession { PyInferenceSession(std::shared_ptr env, const PySessionOptions& so) - : env_(std::move(env)) { + : env_(std::move(env)), session_options_(so) { sess_ = std::make_unique(so.value, *env_); } #if !defined(ORT_MINIMAL_BUILD) PyInferenceSession(std::shared_ptr env, const PySessionOptions& so, const std::string& arg, bool is_arg_file_name) - : env_(std::move(env)) { + : env_(std::move(env)), session_options_(so) { if (is_arg_file_name) { // Given arg is the file path. Invoke the corresponding ctor(). sess_ = std::make_unique(so.value, *env_, arg); @@ -252,6 +252,24 @@ struct PyInferenceSession { } #endif + // Returns true if the session options have provider information from either + // setting explicit providers, setting a provider that supports a OrtEpDevice(s), or + // setting a selection policy (e.g., prefer gpu). + bool HasProvidersInSessionOptions() const { + return !session_options_.provider_factories.empty() || + session_options_.value.ep_selection_policy.enable; + } + + // Returns (and updates) a reference to the OrtSessionOptions for this inference session. + OrtSessionOptions& GetOrtSessionOptions() { + if (sess_) { + // Copy internal value from InferenceSession as it is the source of truth + // and the option configurations may have changed. + session_options_.value = sess_->GetSessionOptions(); + } + return session_options_; + } + InferenceSession* GetSessionHandle() const { return sess_.get(); } virtual ~PyInferenceSession() = default; @@ -264,6 +282,7 @@ struct PyInferenceSession { private: std::shared_ptr env_; std::unique_ptr sess_; + OrtSessionOptions session_options_; }; inline const PySessionOptions& GetDefaultCPUSessionOptions() { diff --git a/onnxruntime/test/python/autoep_helper.py b/onnxruntime/test/python/autoep_helper.py new file mode 100644 index 0000000000000..e3b214afa6e62 --- /dev/null +++ b/onnxruntime/test/python/autoep_helper.py @@ -0,0 +1,67 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +from __future__ import annotations + +import os +import tempfile +import unittest + +import onnxruntime as onnxrt +from onnxruntime.capi.onnxruntime_pybind11_state import Fail + + +class AutoEpTestCase(unittest.TestCase): + """ + Base class for TestCase classes that need to register and unregister EP libraries. + Because EP libraries are registered with the ORT environment and all unit tests share + the same environment, this class tracks which libraries have already been registered + so that they are not erroneously registered or unregistered. + + Derived classes must use 'self.register_execution_provider_library()' and + 'self.unregister_execution_provider_library()' to benefit from these utilities. 
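Putting the new SessionOptions provider APIs together with the InferenceSession changes above, a session can now be configured entirely through its options before construction; the device filter, policy, and model path below are placeholder values::

    import onnxruntime as onnxrt

    sess_options = onnxrt.SessionOptions()

    # Either bind the provider to explicitly selected OrtEpDevice instances...
    cpu_devices = [d for d in onnxrt.get_ep_devices() if d.ep_name == "CPUExecutionProvider"]
    if cpu_devices:
        sess_options.add_provider_for_devices(cpu_devices, {})
    else:
        # ...or let ONNX Runtime pick devices according to a selection policy.
        sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_CPU)

    assert sess_options.has_providers()

    # No 'providers' argument is needed; the session initializes its EPs from the options.
    sess = onnxrt.InferenceSession("model.onnx", sess_options=sess_options)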
+ """ + + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.autoep_") + + # Note: swap with the commented line if you want to see the models in local test dir. + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." + + # Track registered EP libraries across all tests. + cls._registered_providers = set() + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def register_execution_provider_library(self, ep_registration_name: str, ep_lib_path: os.PathLike | str): + if ep_registration_name in self._registered_providers: + return # Already registered + + try: + onnxrt.register_execution_provider_library(ep_registration_name, ep_lib_path) + except Fail as onnxruntime_error: + if "already registered" in str(onnxruntime_error): + pass # Allow register to fail if the EP library was previously registered. + else: + raise onnxruntime_error + + # Add this EP library to set of registered EP libraries. + # If the unit test itself does not unregister the library, tearDown() will try. + self._registered_providers.add(ep_registration_name) + + def unregister_execution_provider_library(self, ep_registration_name: str): + if ep_registration_name not in self._registered_providers: + return # Not registered + + try: + onnxrt.unregister_execution_provider_library(ep_registration_name) + except Fail as onnxruntime_error: + if "was not registered" in str(onnxruntime_error): + pass # Allow unregister to fail if the EP library was never registered. + else: + raise onnxruntime_error + + self._registered_providers.remove(ep_registration_name) diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index f3ebc92409f77..5631e49069ae3 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -1360,13 +1360,20 @@ def test_register_custom_ops_library(self): ) def test_ort_value(self): + providers_to_test = onnxrt.get_available_providers() numpy_arr_input = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) numpy_arr_output = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - def test_session_with_ortvalue_input(ortvalue): - sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + def test_session_with_ortvalue_input(ortvalue, providers): + sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=providers) res = sess.run(["Y"], {"X": ortvalue}) - self.assertTrue(np.array_equal(res[0], numpy_arr_output)) + + if "QNNExecutionProvider" in providers: + # QNN runs float32 with fp16 precision, so relax accuracy expectations + np.testing.assert_allclose(numpy_arr_output, res[0], rtol=1e-04, atol=1e-06) + else: + self.assertTrue(np.array_equal(res[0], numpy_arr_output)) + vect = sess._sess.run_with_ort_values({"X": ortvalue._get_c_value()}, ["Y"], RunOptions()) self.assertIsInstance(vect, OrtValueVector) @@ -1378,7 +1385,7 @@ def test_session_with_ortvalue_input(ortvalue): self.assertTrue(np.array_equal(ortvalue1.numpy(), numpy_arr_input)) # Pass in the constructed OrtValue to a session via Run() and check results - test_session_with_ortvalue_input(ortvalue1) + test_session_with_ortvalue_input(ortvalue1, providers_to_test) # The constructed OrtValue should still be valid after being used in a session self.assertTrue(np.array_equal(ortvalue1.numpy(), numpy_arr_input)) @@ -1392,7 +1399,7 @@ def 
test_session_with_ortvalue_input(ortvalue): self.assertEqual(float_tensor_data_type, ort_value_with_type.element_type()) self.assertEqual([3, 2], ort_value_with_type.shape()) - if "CUDAExecutionProvider" in onnxrt.get_available_providers(): + if "CUDAExecutionProvider" in providers_to_test: ortvalue2 = onnxrt.OrtValue.ortvalue_from_numpy(numpy_arr_input, "cuda", 0) self.assertEqual(ortvalue2.device_name(), "cuda") self.assertEqual(ortvalue2.shape(), [3, 2]) @@ -1401,7 +1408,7 @@ def test_session_with_ortvalue_input(ortvalue): self.assertTrue(np.array_equal(ortvalue2.numpy(), numpy_arr_input)) # Pass in the constructed OrtValue to a session via Run() and check results - test_session_with_ortvalue_input(ortvalue2) + test_session_with_ortvalue_input(ortvalue2, providers_to_test) # The constructed OrtValue should still be valid after being used in a session self.assertTrue(np.array_equal(ortvalue2.numpy(), numpy_arr_input)) diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py new file mode 100644 index 0000000000000..1d7dba5662257 --- /dev/null +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -0,0 +1,167 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +from __future__ import annotations + +import os +import platform +import sys +import unittest + +import numpy as np +from autoep_helper import AutoEpTestCase +from helper import get_name + +import onnxruntime as onnxrt +from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument + +# handle change from python 3.8 and on where loading a dll from the current directory needs to be explicitly allowed. +if platform.system() == "Windows" and sys.version_info.major >= 3 and sys.version_info.minor >= 8: # noqa: YTT204 + os.add_dll_directory(os.getcwd()) + +available_providers = list(onnxrt.get_available_providers()) + + +class TestAutoEP(AutoEpTestCase): + def test_cuda_ep_register_and_inference(self): + """ + Test registration of CUDA EP, adding its OrtDevice to the SessionOptions, and running inference. 
+ """ + ep_lib_path = "onnxruntime_providers_cuda.dll" + ep_registration_name = "CUDAExecutionProvider" + + if sys.platform != "win32": + self.skipTest("Skipping test because device discovery is only supported on Windows") + + if ep_registration_name not in available_providers: + self.skipTest("Skipping test because it needs to run on CUDA EP") + + self.register_execution_provider_library(ep_registration_name, ep_lib_path) + + ep_devices = onnxrt.get_ep_devices() + has_cpu_ep = False + cuda_ep_device = None + for ep_device in ep_devices: + ep_name = ep_device.ep_name + if ep_name == "CPUExecutionProvider": + has_cpu_ep = True + if ep_name == ep_registration_name: + cuda_ep_device = ep_device + + self.assertTrue(has_cpu_ep) + self.assertIsNotNone(cuda_ep_device) + self.assertEqual(cuda_ep_device.ep_vendor, "Microsoft") + + hw_device = cuda_ep_device.device + self.assertEqual(hw_device.type, onnxrt.OrtHardwareDeviceType.GPU) + + # Add CUDA's OrtEpDevice to session options + sess_options = onnxrt.SessionOptions() + sess_options.add_provider_for_devices([cuda_ep_device], {"prefer_nhwc": "1"}) + self.assertTrue(sess_options.has_providers()) + + # Run sample model and check output + sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + input_name = sess.get_inputs()[0].name + res = sess.run([], {input_name: x}) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + + # TODO(adrianlizarraga): Unregistering CUDA EP library causes issues. Investigate. + # self.unregister_execution_provider_library(ep_registration_name) + + def test_cuda_prefer_gpu_and_inference(self): + """ + Test selecting CUDA EP via the PREFER_GPU policy and running inference. + """ + ep_lib_path = "onnxruntime_providers_cuda.dll" + ep_registration_name = "CUDAExecutionProvider" + + if sys.platform != "win32": + self.skipTest("Skipping test because device discovery is only supported on Windows") + + if ep_registration_name not in available_providers: + self.skipTest("Skipping test because it needs to run on CUDA EP") + + self.register_execution_provider_library(ep_registration_name, ep_lib_path) + + # Set a policy to prefer GPU. Cuda should be selected. + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + # Run sample model and check output + sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + input_name = sess.get_inputs()[0].name + res = sess.run([], {input_name: x}) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + + # TODO(adrianlizarraga): Unregistering CUDA EP library causes issues. Investigate. + # self.unregister_execution_provider_library(ep_registration_name) + + def test_example_plugin_ep_devices(self): + """ + Test registration of an example EP plugin and retrieval of its OrtEpDevice. 
+ """ + if sys.platform != "win32": + self.skipTest("Skipping test because it device discovery is only supported on Windows") + + ep_lib_path = "example_plugin_ep.dll" + try: + ep_lib_path = get_name("example_plugin_ep.dll") + except FileNotFoundError: + self.skipTest(f"Skipping test because EP library '{ep_lib_path}' cannot be found") + + ep_registration_name = "example_ep" + self.register_execution_provider_library(ep_registration_name, os.path.realpath(ep_lib_path)) + + ep_devices = onnxrt.get_ep_devices() + has_cpu_ep = False + test_ep_device = None + for ep_device in ep_devices: + ep_name = ep_device.ep_name + + if ep_name == "CPUExecutionProvider": + has_cpu_ep = True + if ep_name == ep_registration_name: + test_ep_device = ep_device + + self.assertTrue(has_cpu_ep) + self.assertIsNotNone(test_ep_device) + + # Test the OrtEpDevice getters. Expected values are from /onnxruntime/test/autoep/library/example_plugin_ep.cc + self.assertEqual(test_ep_device.ep_vendor, "Contoso") + + ep_metadata = test_ep_device.ep_metadata + self.assertEqual(ep_metadata["version"], "0.1") + + ep_options = test_ep_device.ep_options + self.assertEqual(ep_options["run_really_fast"], "true") + + # The CPU hw device info will vary by machine so check for the common values. + hw_device = test_ep_device.device + self.assertEqual(hw_device.type, onnxrt.OrtHardwareDeviceType.CPU) + self.assertGreaterEqual(hw_device.vendor_id, 0) + self.assertGreaterEqual(hw_device.device_id, 0) + self.assertGreater(len(hw_device.vendor), 0) + + hw_metadata = hw_device.metadata + self.assertGreater(len(hw_metadata), 0) # Should have at least SPDRP_HARDWAREID on Windows + + # Test adding this EP plugin's OrtEpDevice to the SessionOptions. + sess_options = onnxrt.SessionOptions() + with self.assertRaises(InvalidArgument) as context: + # Will raise InvalidArgument because ORT currently only supports provider bridge APIs. + # Actual plugin EPs will be supported in the future. + sess_options.add_provider_for_devices([test_ep_device], {"opt1": "val1"}) + self.assertIn("EP is not currently supported", str(context.exception)) + + self.unregister_execution_provider_library(ep_registration_name) + + +if __name__ == "__main__": + unittest.main(verbosity=1) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 1f6cd78f28334..6ed7dfe59b1f6 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -40,18 +40,28 @@ def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): This case is handled specifically in ExecutionFrame::AllocateAsPerAllocationPlan(). This test is to ensure that the case is covered. """ + providers = onnxrt.get_available_providers() + has_qnn_ep = "QNNExecutionProvider" in providers name = get_name("alloc_tensor_reuse.onnx") - sess = onnxrt.InferenceSession(name, providers=onnxrt.get_available_providers()) + sess = onnxrt.InferenceSession(name, providers=providers) run_options = onnxrt.RunOptions() run_options.only_execute_path_to_fetches = True inp0, inp1 = np.ones((10,), dtype=np.float32), np.ones((10,), dtype=np.float32) session_run_results = sess.run(["outp0"], {"inp0": inp0, "inp1": inp1}, run_options) - assert_allclose(session_run_results[0], -(inp0 + inp1)) + if has_qnn_ep: + # QNN EP runs fp32 with fp16 precision, so relax tolerance. 
+ assert_allclose(session_run_results[0], -(inp0 + inp1), rtol=1e-6, atol=1e-6) + else: + assert_allclose(session_run_results[0], -(inp0 + inp1)) session_run_results = sess.run(["outp1"], {"inp0": inp0, "inp1": inp1}, run_options) - assert_allclose(session_run_results[0], -(inp0 - inp1)) + if has_qnn_ep: + # QNN EP runs fp32 with fp16 precision, so relax tolerance. + assert_allclose(session_run_results[0], -(inp0 - inp1), rtol=1e-6, atol=1e-6) + else: + assert_allclose(session_run_results[0], -(inp0 - inp1)) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_compile_api.py b/onnxruntime/test/python/onnxruntime_test_python_compile_api.py new file mode 100644 index 0000000000000..f5f23e2da1e43 --- /dev/null +++ b/onnxruntime/test/python/onnxruntime_test_python_compile_api.py @@ -0,0 +1,185 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +from __future__ import annotations + +import os +import platform +import sys +import unittest + +import onnx +from autoep_helper import AutoEpTestCase +from helper import get_name + +import onnxruntime as onnxrt +from onnxruntime.capi.onnxruntime_pybind11_state import ModelRequiresCompilation + +# handle change from python 3.8 and on where loading a dll from the current directory needs to be explicitly allowed. +if platform.system() == "Windows" and sys.version_info.major >= 3 and sys.version_info.minor >= 8: # noqa: YTT204 + os.add_dll_directory(os.getcwd()) + +available_providers = list(onnxrt.get_available_providers()) + + +class TestCompileApi(AutoEpTestCase): + def test_compile_with_files_prefer_npu_policy(self): + """ + Tests compiling a model (to/from files) using an EP selection policy (PREFER_NPU). + """ + if "QNNExecutionProvider" not in available_providers: + self.skipTest("Skipping test because it needs to run on QNN EP") + + if sys.platform != "win32": + self.skipTest("Skipping test because provider selection policies are only supported on Windows") + + ep_lib_path = "onnxruntime_providers_qnn.dll" + ep_registration_name = "QNNExecutionProvider" + self.register_execution_provider_library(ep_registration_name, ep_lib_path) + + input_model_path = get_name("nhwc_resize_scales_opset18.onnx") + output_model_path = os.path.join(self._tmp_dir_path, "model.compiled0.onnx") + + session_options = onnxrt.SessionOptions() + session_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_NPU) + + model_compiler = onnxrt.ModelCompiler( + session_options, + input_model_path, + embed_compiled_data_into_model=True, + external_initializers_file_path=None, + ) + model_compiler.compile_to_file(output_model_path) + self.assertTrue(os.path.exists(output_model_path)) + self.unregister_execution_provider_library(ep_registration_name) + + def test_compile_with_input_and_output_files(self): + """ + Tests compiling a model (to/from files) using explicit EP. 
+ """ + provider = None + provider_options = dict() + if "QNNExecutionProvider" in available_providers: + provider = "QNNExecutionProvider" + provider_options["backend_type"] = "htp" + # TODO(adrianlizarraga): Allow test to run for other compiling EPs (e.g., OpenVINO) + + input_model_path = get_name("nhwc_resize_scales_opset18.onnx") + output_model_path = os.path.join(self._tmp_dir_path, "model.compiled1.onnx") + + session_options = onnxrt.SessionOptions() + if provider: + session_options.add_provider(provider, provider_options) + + model_compiler = onnxrt.ModelCompiler( + session_options, + input_model_path, + embed_compiled_data_into_model=True, + external_initializers_file_path=None, + ) + model_compiler.compile_to_file(output_model_path) + self.assertTrue(os.path.exists(output_model_path)) + + def test_compile_to_file_with_input_model_in_buffer(self): + """ + Tests compiling an input model that is stored in a buffer. The output is saved to a file. + """ + provider = None + provider_options = dict() + if "QNNExecutionProvider" in available_providers: + provider = "QNNExecutionProvider" + provider_options["backend_type"] = "htp" + # TODO(adrianlizarraga): Allow test to run for other compiling EPs (e.g., OpenVINO) + + input_onnx_model = onnx.load(get_name("nhwc_resize_scales_opset18.onnx")) + input_model_bytes = input_onnx_model.SerializeToString() + output_model_path = os.path.join(self._tmp_dir_path, "model.compiled2.onnx") + + session_options = onnxrt.SessionOptions() + if provider: + session_options.add_provider(provider, provider_options) + + model_compiler = onnxrt.ModelCompiler( + session_options, + input_model_bytes, + embed_compiled_data_into_model=True, + external_initializers_file_path=None, + ) + model_compiler.compile_to_file(output_model_path) + self.assertTrue(os.path.exists(output_model_path)) + + def test_compile_from_buffer_to_buffer(self): + """ + Tests compiling an input model that is stored in a buffer. The output is stored in a buffer too. + """ + provider = None + provider_options = dict() + if "QNNExecutionProvider" in available_providers: + provider = "QNNExecutionProvider" + provider_options["backend_type"] = "htp" + # TODO(adrianlizarraga): Allow test to run for other compiling EPs (e.g., OpenVINO) + + input_onnx_model = onnx.load(get_name("nhwc_resize_scales_opset18.onnx")) + input_model_bytes = input_onnx_model.SerializeToString() + + session_options = onnxrt.SessionOptions() + if provider: + session_options.add_provider(provider, provider_options) + + model_compiler = onnxrt.ModelCompiler( + session_options, + input_model_bytes, + embed_compiled_data_into_model=True, + external_initializers_file_path=None, + ) + output_model_bytes = model_compiler.compile_to_bytes() + self.assertTrue(isinstance(output_model_bytes, bytes)) + self.assertGreater(len(output_model_bytes), 0) + + def test_fail_load_uncompiled_model_and_then_compile(self): + """ + Tests compiling scenario: + - Load uncompiled model into session that disables JIT compilation. + - Expect an error (ModelRequiresCompilation) + - Compile model and retry creating an inference session successfully. + """ + if "QNNExecutionProvider" not in available_providers: + self.skipTest("Skipping test because it needs to run on a compiling EP") + + input_model_path = get_name("nhwc_resize_scales_opset18.onnx") + + session_options = onnxrt.SessionOptions() + session_options.add_session_config_entry("session.disable_model_compile", "1") # Disable JIT model compilation! 
+ session_options.add_provider("QNNExecutionProvider", {"backend_type": "htp"}) + + # Session creation should fail with error ORT_MODEL_REQUIRES_COMPILATION because the input model + # is not compiled and we disabled JIT compilation for this session. + with self.assertRaises(ModelRequiresCompilation) as context: + onnxrt.InferenceSession( + input_model_path, + sess_options=session_options, + enable_fallback=False, + ) + self.assertIn("needs to compile", str(context.exception)) + + # Try to compile the model now. + compiled_model_path = os.path.join(self._tmp_dir_path, "model.compiled3.onnx") + model_compiler = onnxrt.ModelCompiler( + session_options, + input_model_path, + embed_compiled_data_into_model=True, + external_initializers_file_path="external_weights.bin", + external_initializers_size_threshold=128, + ) + model_compiler.compile_to_file(compiled_model_path) + + self.assertTrue(os.path.exists(compiled_model_path)) + self.assertEqual(session_options.get_session_config_entry("session.disable_model_compile"), "1") + self.assertTrue(session_options.has_providers()) + + # Creating the session with the compiled model should not fail. + sess = onnxrt.InferenceSession(compiled_model_path, sess_options=session_options) + self.assertIsNotNone(sess) + + +if __name__ == "__main__": + unittest.main(verbosity=1) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 03b51790e0ef6..f4b7a78fe4c99 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1667,6 +1667,9 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): [sys.executable, "onnxruntime_test_python.py"], cwd=cwd, dll_path=dll_path, python_path=python_path ) + log.info("Testing AutoEP feature") + run_subprocess([sys.executable, "onnxruntime_test_python_autoep.py"], cwd=cwd, dll_path=dll_path) + if not args.disable_contrib_ops: run_subprocess([sys.executable, "onnxruntime_test_python_sparse_matmul.py"], cwd=cwd, dll_path=dll_path) @@ -1761,6 +1764,12 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): cwd=cwd, ) + if not args.disable_contrib_ops: + log.info("Testing Python Compile API") + run_subprocess( + [sys.executable, "onnxruntime_test_python_compile_api.py"], cwd=cwd, dll_path=dll_path + ) + if not args.skip_onnx_tests: run_subprocess([os.path.join(cwd, "onnx_test_runner"), "test_models"], cwd=cwd) if config != "Debug": From bb5d2c22b8f8be3566e95323683013fab4076194 Mon Sep 17 00:00:00 2001 From: David Fan <30608893+jiafatom@users.noreply.github.com> Date: Tue, 6 May 2025 00:12:40 -0700 Subject: [PATCH 11/84] k_quant should have zero_point (#24647) ### Description As titled. 
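In practice this is a one-line change in `rtn_quantize`: the `zero_point` initializer is now emitted whenever the k_quant algorithm is used, not only for the asymmetric scheme. A minimal sketch of the new condition follows (argument names match the diff below; the helper name is illustrative, as the expression appears inline in the source, and the surrounding call that packs `q_weight`, `scale`, and `zero_point` is unchanged):

```python
# Sketch of the updated condition in rtn_quantize (see the diff below).
# k_quant always carries its zero point, even when the caller requested
# the default symmetric ("sym") scheme.
def select_zero_point(zp, scheme: str, algorithm: str):
    return zp if scheme == "asym" or algorithm == "k_quant" else None
```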
--- .../python/tools/quantization/neural_compressor/weight_only.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/quantization/neural_compressor/weight_only.py b/onnxruntime/python/tools/quantization/neural_compressor/weight_only.py index 4eda7efc9b8fe..558415f028c7b 100644 --- a/onnxruntime/python/tools/quantization/neural_compressor/weight_only.py +++ b/onnxruntime/python/tools/quantization/neural_compressor/weight_only.py @@ -506,7 +506,7 @@ def rtn_quantize( k_blocks=k_blocks, q_weight=q_weight.astype("uint8"), scale=scale.astype(dtype), - zero_point=zp if scheme == "asym" else None, + zero_point=zp if scheme == "asym" or algorithm == "k_quant" else None, accuracy_level=accuracy_level, ) From 337839e33623abd220ab82e5c84d3db3bb5672e0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 May 2025 01:03:36 -0700 Subject: [PATCH 12/84] Bump vite from 6.2.6 to 6.3.4 in /js/web/test/e2e/exports/testcases/vite-default (#24607) Bumps [vite](https://github.com/vitejs/vite/tree/HEAD/packages/vite) from 6.2.6 to 6.3.4.
Release notes

Sourced from vite's releases.

v6.2.7 through v6.3.4 (including v6.3.0-beta.0/1/2, create-vite@6.3.0 and create-vite@6.3.1): please refer to CHANGELOG.md for details.
Changelog

Sourced from vite's changelog.

6.3.4 (2025-04-30)

  • fix: check static serve file inside sirv (#19965) (c22c43d), closes #19965
  • fix(optimizer): return plain object when using require to import externals in optimized dependencies (#19940) (efc5eab), closes #19940
  • refactor: remove duplicate plugin context type (#19935) (d6d01c2), closes #19935

6.3.3 (2025-04-24)

  • fix: ignore malformed uris in tranform middleware (#19853) (e4d5201), closes #19853
  • fix(assets): ensure ?no-inline is not included in the asset url in the production environment (#19496) (16a73c0), closes #19496
  • fix(css): resolve relative imports in sass properly on Windows (#19920) (ffab442), closes #19920
  • fix(deps): update all non-major dependencies (#19899) (a4b500e), closes #19899
  • fix(ssr): fix execution order of re-export (#19841) (ed29dee), closes #19841
  • fix(ssr): fix live binding of default export declaration and hoist exports getter (#19842) (80a91ff), closes #19842
  • perf: skip sourcemap generation for renderChunk hook of import-analysis-build plugin (#19921) (55cfd04), closes #19921
  • test(ssr): test ssrTransform re-export deps and test stacktrace with first line (#19629) (9399cda), closes #19629

6.3.2 (2025-04-18)

6.3.1 (2025-04-17)

6.3.0 (2025-04-16)

6.3.0-beta.2 (2025-04-11)

... (truncated)

Commits
  • b040d54 release: v6.3.4
  • c22c43d fix: check static serve file inside sirv (#19965)
  • efc5eab fix(optimizer): return plain object when using require to import externals ...
  • d6d01c2 refactor: remove duplicate plugin context type (#19935)
  • db9eb97 release: v6.3.3
  • e4d5201 fix: ignore malformed uris in tranform middleware (#19853)
  • 55cfd04 perf: skip sourcemap generation for renderChunk hook of import-analysis-build...
  • ffab442 fix(css): resolve relative imports in sass properly on Windows (#19920)
  • 16a73c0 fix(assets): ensure ?no-inline is not included in the asset url in the produc...
  • 9399cda test(ssr): test ssrTransform re-export deps and test stacktrace with first ...
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=vite&package-manager=npm_and_yarn&previous-version=6.2.6&new-version=6.3.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../testcases/vite-default/package-lock.json | 58 +++++++++++++++++-- .../testcases/vite-default/package.json | 2 +- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json index c9da59b4b0021..48f0a8f3e9d5c 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json +++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json @@ -12,7 +12,7 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.2.6" + "vite": "^6.3.5" } }, "node_modules/@babel/helper-string-parser": { @@ -944,6 +944,21 @@ "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", "license": "MIT" }, + "node_modules/fdir": { + "version": "6.4.4", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.4.4.tgz", + "integrity": "sha512-1NZP+GK4GfuAv3PqKvxQRDMjdSRZjnkq7KfhlNrCNNlZ0ygQFpebfrnfnq/W7fpUnAv9aGWmY1zKx7FYL3gwhg==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, "node_modules/fsevents": { "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", @@ -992,6 +1007,19 @@ "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", "license": "ISC" }, + "node_modules/picomatch": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.2.tgz", + "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, "node_modules/postcss": { "version": "8.5.3", "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.3.tgz", @@ -1068,16 +1096,36 @@ "node": ">=0.10.0" } }, + "node_modules/tinyglobby": { + "version": "0.2.13", + "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.13.tgz", + "integrity": "sha512-mEwzpUgrLySlveBwEVDMKk5B57bhLPYovRfPAXD5gA/98Opn0rCDj3GtLwFvCvH5RK9uPCExUROW5NjDwvqkxw==", + "dev": true, + "license": "MIT", + "dependencies": { + "fdir": "^6.4.4", + "picomatch": "^4.0.2" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/SuperchupuDev" + } + }, "node_modules/vite": { - "version": "6.2.6", - "resolved": "https://registry.npmjs.org/vite/-/vite-6.2.6.tgz", - "integrity": "sha512-9xpjNl3kR4rVDZgPNdTL0/c6ao4km69a/2ihNQbcANz8RuCOK3hQBmLSJf3bRKVQjVMda+YvizNE8AwvogcPbw==", + "version": "6.3.5", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.3.5.tgz", + "integrity": "sha512-cZn6NDFE7wdTpINgs++ZJ4N49W2vRp8LCKrn3Ob1kYNtOo21vfDoaV5GzBfLU4MovSAB8uNRm4jgzVQZ+mBzPQ==", "dev": true, "license": "MIT", "dependencies": { "esbuild": "^0.25.0", + "fdir": "^6.4.4", + "picomatch": "^4.0.2", "postcss": "^8.5.3", - "rollup": "^4.30.1" + "rollup": "^4.34.9", + "tinyglobby": "^0.2.13" }, "bin": { "vite": "bin/vite.js" diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json index 5169734074299..f7d5751354905 100644 --- a/js/web/test/e2e/exports/testcases/vite-default/package.json +++ 
b/js/web/test/e2e/exports/testcases/vite-default/package.json @@ -13,6 +13,6 @@ }, "devDependencies": { "@vitejs/plugin-vue": "^5.2.1", - "vite": "^6.2.6" + "vite": "^6.3.5" } } From 01aef5f71cc1d8d35ef64cad56a1235263772dc9 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 6 May 2025 01:08:23 -0700 Subject: [PATCH 13/84] add check in NPM packaging pipeline for @dev package version (#24641) ### Description This PR adds a check for the package version for dev channel. This PR should be able to help avoid publishing packages like "-rc.*" to dev channel automatically. ### Motivation and Context --- tools/ci_build/github/js/validate-npm-packages.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/ci_build/github/js/validate-npm-packages.py b/tools/ci_build/github/js/validate-npm-packages.py index b009330764973..37b5b3d9807a3 100644 --- a/tools/ci_build/github/js/validate-npm-packages.py +++ b/tools/ci_build/github/js/validate-npm-packages.py @@ -129,6 +129,16 @@ if RELEASE_WEB and RELEASE_REACT_NATIVE and ort_web_ver != ort_react_native_ver: raise Exception("version number is different for onnxruntime-web and onnxruntime-react-native") +# @dev build has to match the following pattern: +# "x.y.z-dev.*" +if tag == "dev": + if RELEASE_NODE and "-dev" not in ort_node_ver: + raise Exception(f'@dev build version should contain "-dev". ort_node_ver={ort_node_ver}') + if RELEASE_WEB and "-dev" not in ort_web_ver: + raise Exception(f'@dev build version should contain "-dev". ort_web_ver={ort_web_ver}') + if RELEASE_REACT_NATIVE and "-dev" not in ort_react_native_ver: + raise Exception(f'@dev build version should contain "-dev". ort_react_native_ver={ort_react_native_ver}') + print("====== validated versions ======") print(f"source_branch={source_branch}") print(f"tag={tag}") From 1f4156c6146d04999e5e419df4ae6628e928aaad Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 6 May 2025 18:29:05 +1000 Subject: [PATCH 14/84] Add support for selection policy delegate (#24635) ### Description Add support for selection policy delegate - split API function into one for the policy enum and one for the delegate - add `void*` for user state - required to wire up using the delegate in other languages. Add C# support for specifying the selection policy delegate. Address comments from initial C# autoep support PR. 
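For reference, the new C# surface added in this diff lets callers plug in their own selection logic. Below is a minimal sketch of how the delegate might be wired up; the specific policy (pick the first device and fall back to the CPU EP, which is always listed last) and the `model.onnx` path are illustrative, while the actual signatures come from `SessionOptions.shared.cs` and the `OrtAutoEpTests.cs` test in this change:

```csharp
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.OnnxRuntime;

// Custom selection logic: keep the highest-priority device and,
// if allowed, also keep the ORT CPU EP (always the last entry).
static List<OrtEpDevice> SelectFirstAndCpu(IReadOnlyList<OrtEpDevice> epDevices,
                                           OrtKeyValuePairs modelMetadata,
                                           OrtKeyValuePairs runtimeMetadata,
                                           uint maxSelections)
{
    var selected = new List<OrtEpDevice> { epDevices[0] };
    if (maxSelections > 2 && epDevices.Count > 1)
    {
        selected.Add(epDevices.Last());
    }
    return selected;
}

using var sessionOptions = new SessionOptions();
sessionOptions.SetEpSelectionPolicyDelegate(SelectFirstAndCpu);
using var session = new InferenceSession("model.onnx", sessionOptions);
```

Under the hood the C# wrapper pins the managed delegate and passes it to the new `SessionOptionsSetEpSelectionPolicyDelegate` C API together with a `GCHandle`-backed `state` pointer, which is why the `void*` state argument was added to the native signature.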
### Motivation and Context --- .../NativeMethods.shared.cs | 99 +++++++++++- .../OrtEpDevice.shared.cs | 30 ++-- .../OrtHardwareDevice.shared.cs | 30 ++-- .../SessionOptions.shared.cs | 151 +++++++++++++++++- .../OrtAutoEpTests.cs | 49 +++++- .../core/session/onnxruntime_c_api.h | 38 +++-- .../core/session/onnxruntime_cxx_api.h | 6 +- .../core/session/onnxruntime_cxx_inline.h | 11 +- onnxruntime/core/framework/session_options.h | 3 +- .../core/session/abi_session_options.cc | 16 +- onnxruntime/core/session/inference_session.cc | 5 + onnxruntime/core/session/inference_session.h | 2 + onnxruntime/core/session/onnxruntime_c_api.cc | 3 +- onnxruntime/core/session/ort_apis.h | 7 +- .../core/session/provider_policy_context.cc | 52 ++++-- onnxruntime/core/session/utils.cc | 46 +++--- .../test/autoep/test_autoep_selection.cc | 149 ++++++++++++++++- 17 files changed, 592 insertions(+), 105 deletions(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index 620c13b8641b5..c543414ca13a9 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -353,6 +353,7 @@ public struct OrtApi public IntPtr SessionOptionsAppendExecutionProvider_V2; public IntPtr SessionOptionsSetEpSelectionPolicy; + public IntPtr SessionOptionsSetEpSelectionPolicyDelegate; public IntPtr HardwareDevice_Type; public IntPtr HardwareDevice_VendorId; @@ -692,6 +693,11 @@ static NativeMethods() (DSessionOptionsSetEpSelectionPolicy)Marshal.GetDelegateForFunctionPointer( api_.SessionOptionsSetEpSelectionPolicy, typeof(DSessionOptionsSetEpSelectionPolicy)); + + OrtSessionOptionsSetEpSelectionPolicyDelegate = + (DSessionOptionsSetEpSelectionPolicyDelegate)Marshal.GetDelegateForFunctionPointer( + api_.SessionOptionsSetEpSelectionPolicyDelegate, + typeof(DSessionOptionsSetEpSelectionPolicyDelegate)); } internal class NativeLib @@ -2278,28 +2284,49 @@ out IntPtr lora_adapter #region Auto EP API related // // OrtKeyValuePairs + + /// + /// Create an OrtKeyValuePairs instance. + /// [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate void DOrtCreateKeyValuePairs(out IntPtr /* OrtKeyValuePairs** */ kvps); + /// + /// Add/replace a key-value pair in the OrtKeyValuePairs instance. + /// [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate void DOrtAddKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, byte[] /* const char* */ key, byte[] /* const char* */ value); + /// + /// Get the value for the provided key. + /// + /// Value. Returns IntPtr.Zero if key was not found. [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /* const char* */ DOrtGetKeyValue(IntPtr /* const OrtKeyValuePairs* */ kvps, byte[] /* const char* */ key); + /// + /// Get all the key-value pairs in the OrtKeyValuePairs instance. + /// [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate void DOrtGetKeyValuePairs(IntPtr /* const OrtKeyValuePairs* */ kvps, out IntPtr /* const char* const** */ keys, out IntPtr /* const char* const** */ values, out UIntPtr /* size_t* */ numEntries); + /// + /// Remove a key-value pair from the OrtKeyValuePairs instance. + /// Ignores keys that are not present. + /// [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate void DOrtRemoveKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, byte[] /* const char* */ key); + /// + /// Release the OrtKeyValuePairs instance. 
+ /// [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate void DOrtReleaseKeyValuePairs(IntPtr /* OrtKeyValuePairs* */ kvps); @@ -2370,12 +2397,27 @@ public delegate void DOrtRemoveKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, // // Auto Selection EP registration and selection customization + + /// + /// Register an execution provider library. + /// The library must implement CreateEpFactories and ReleaseEpFactory. + /// + /// Environment to add the EP library to. + /// Name to register the library under. + /// Absolute path to the library. + /// OrtStatus* [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /* OrtStatus* */ DOrtRegisterExecutionProviderLibrary( IntPtr /* OrtEnv* */ env, byte[] /* const char* */ registration_name, byte[] /* const ORTCHAR_T* */ path); + /// + /// Unregister an execution provider library. + /// + /// The environment to unregister the library from. + /// The name the library was registered under. + /// OrtStatus* [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /* OrtStatus* */ DOrtUnregisterExecutionProviderLibrary( IntPtr /* OrtEnv* */ env, @@ -2384,6 +2426,11 @@ public delegate void DOrtRemoveKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, public static DOrtRegisterExecutionProviderLibrary OrtRegisterExecutionProviderLibrary; public static DOrtUnregisterExecutionProviderLibrary OrtUnregisterExecutionProviderLibrary; + /// + /// Get the OrtEpDevices that are available. + /// These are all the possible execution provider and device pairs. + /// + /// OrtStatus* [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /* OrtStatus* */ DOrtGetEpDevices( IntPtr /* const OrtEnv* */ env, @@ -2392,6 +2439,20 @@ public delegate void DOrtRemoveKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, public static DOrtGetEpDevices OrtGetEpDevices; + /// + /// Add execution provider devices to the session options. + /// Priority is based on the order of the OrtEpDevice instances. Highest priority first. + /// All OrtEpDevice instances in ep_devices must be for the same execution provider. + /// e.g. selecting OpenVINO for GPU and NPU would have an OrtEpDevice for GPU and NPU. + /// + /// SessionOptions to add to. + /// Environment that the OrtEpDevice instances came from by calling GetEpDevices + /// One or more OrtEpDevice instances. + /// Number of OrtEpDevice instances. + /// User overrides for execution provider options. May be IntPtr.Zero. + /// User overrides for execution provider options. May be IntPtr.Zero. + /// Number of user overrides for execution provider options. + /// [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /* OrtStatus* */ DOrtSessionOptionsAppendExecutionProvider_V2( IntPtr /* OrtSessionOptions* */ sess_options, @@ -2404,6 +2465,18 @@ public delegate void DOrtRemoveKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, public static DOrtSessionOptionsAppendExecutionProvider_V2 OrtSessionOptionsAppendExecutionProvider_V2; + /// + /// Delegate to do custom execution provider selection. + /// + /// Available OrtEpDevices to select from. + /// Number of OrtEpDevices. + /// Metadata from the ONNX model. + /// Runtime metadata. May be IntPtr.Zero. + /// OrtEpDevices that were selected. Pre-allocated array for delegate to update. + /// Maximum number of OrtEpDevices that can be selected. + /// Number of OrtEpDevices that were selected. + /// State that was provided in when the delegate was registered. 
+ /// OrtStatus* [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr DOrtEpSelectionDelegate( IntPtr /* OrtEpDevice** */ epDevices, @@ -2412,16 +2485,36 @@ public delegate IntPtr DOrtEpSelectionDelegate( IntPtr /* OrtKeyValuePairs* */ runtimeMetadata, IntPtr /* OrtEpDevice** */ selected, uint maxSelected, - out UIntPtr numSelected + out UIntPtr numSelected, + IntPtr /* void* */ state ); + /// + /// Set the execution provider selection policy. + /// + /// SessionOptions to set the policy for. + /// Selection policy. + /// OrtStatus* [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /* OrtStatus* */ DSessionOptionsSetEpSelectionPolicy( IntPtr /* OrtSessionOptions* */ session_options, - int /* OrtExecutionProviderDevicePolicy */ policy, - IntPtr /* DOrtEpSelectionDelegate* */ selection_delegate); + int /* OrtExecutionProviderDevicePolicy */ policy); public static DSessionOptionsSetEpSelectionPolicy OrtSessionOptionsSetEpSelectionPolicy; + /// + /// Set the execution provider selection policy delegate. + /// + /// SessionOptions to set the policy for. + /// Selection policy delegate. + /// State that is passed through to the selection delegate. + /// OrtStatus* + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DSessionOptionsSetEpSelectionPolicyDelegate( + IntPtr /* OrtSessionOptions* */ session_options, + IntPtr /* DOrtEpSelectionDelegate* */ selection_delegate, + IntPtr /* void* */ state); + public static DSessionOptionsSetEpSelectionPolicyDelegate OrtSessionOptionsSetEpSelectionPolicyDelegate; + #endregion #region Misc API diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEpDevice.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEpDevice.shared.cs index e3947d900214e..0318e08519128 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEpDevice.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEpDevice.shared.cs @@ -10,18 +10,18 @@ namespace Microsoft.ML.OnnxRuntime /// Represents the combination of an execution provider and a hardware device /// that the execution provider can utilize. ///
- public class OrtEpDevice : SafeHandle + public class OrtEpDevice { /// /// Construct an OrtEpDevice from an existing native OrtEpDevice instance. /// /// Native OrtEpDevice handle. internal OrtEpDevice(IntPtr epDeviceHandle) - : base(epDeviceHandle, ownsHandle: false) { + _handle = epDeviceHandle; } - internal IntPtr Handle => handle; + internal IntPtr Handle => _handle; /// /// The name of the execution provider. @@ -30,7 +30,7 @@ public string EpName { get { - IntPtr namePtr = NativeMethods.OrtEpDevice_EpName(handle); + IntPtr namePtr = NativeMethods.OrtEpDevice_EpName(_handle); return NativeOnnxValueHelper.StringFromNativeUtf8(namePtr); } } @@ -42,7 +42,7 @@ public string EpVendor { get { - IntPtr vendorPtr = NativeMethods.OrtEpDevice_EpVendor(handle); + IntPtr vendorPtr = NativeMethods.OrtEpDevice_EpVendor(_handle); return NativeOnnxValueHelper.StringFromNativeUtf8(vendorPtr); } } @@ -54,7 +54,7 @@ public OrtKeyValuePairs EpMetadata { get { - return new OrtKeyValuePairs(NativeMethods.OrtEpDevice_EpMetadata(handle)); + return new OrtKeyValuePairs(NativeMethods.OrtEpDevice_EpMetadata(_handle)); } } @@ -65,7 +65,7 @@ public OrtKeyValuePairs EpOptions { get { - return new OrtKeyValuePairs(NativeMethods.OrtEpDevice_EpOptions(handle)); + return new OrtKeyValuePairs(NativeMethods.OrtEpDevice_EpOptions(_handle)); } } @@ -76,23 +76,11 @@ public OrtHardwareDevice HardwareDevice { get { - IntPtr devicePtr = NativeMethods.OrtEpDevice_Device(handle); + IntPtr devicePtr = NativeMethods.OrtEpDevice_Device(_handle); return new OrtHardwareDevice(devicePtr); } } - /// - /// Indicates whether the native handle is invalid. - /// - public override bool IsInvalid => handle == IntPtr.Zero; - - /// - /// No-op. OrtEpDevice is always read-only as the instance is owned by native ORT. - /// - /// True - protected override bool ReleaseHandle() - { - return true; - } + private readonly IntPtr _handle; } } \ No newline at end of file diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtHardwareDevice.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtHardwareDevice.shared.cs index 8e7caae90ff79..af7115a92285e 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtHardwareDevice.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtHardwareDevice.shared.cs @@ -21,16 +21,16 @@ public enum OrtHardwareDeviceType /// /// Represents a hardware device that is available on the current system. /// - public class OrtHardwareDevice : SafeHandle + public class OrtHardwareDevice { /// /// Construct an OrtHardwareDevice for a native OrtHardwareDevice instance. /// /// Native OrtHardwareDevice handle. 
- internal OrtHardwareDevice(IntPtr deviceHandle) - : base(deviceHandle, ownsHandle: false) + internal OrtHardwareDevice(IntPtr deviceHandle) { + _handle = deviceHandle; } /// @@ -40,7 +40,7 @@ public OrtHardwareDeviceType Type { get { - return (OrtHardwareDeviceType)NativeMethods.OrtHardwareDevice_Type(handle); + return (OrtHardwareDeviceType)NativeMethods.OrtHardwareDevice_Type(_handle); } } @@ -54,7 +54,7 @@ public uint VendorId { get { - return NativeMethods.OrtHardwareDevice_VendorId(handle); + return NativeMethods.OrtHardwareDevice_VendorId(_handle); } } @@ -65,7 +65,7 @@ public string Vendor { get { - IntPtr vendorPtr = NativeMethods.OrtHardwareDevice_Vendor(handle); + IntPtr vendorPtr = NativeMethods.OrtHardwareDevice_Vendor(_handle); return NativeOnnxValueHelper.StringFromNativeUtf8(vendorPtr); } } @@ -82,7 +82,7 @@ public uint DeviceId { get { - return NativeMethods.OrtHardwareDevice_DeviceId(handle); + return NativeMethods.OrtHardwareDevice_DeviceId(_handle); } } @@ -95,22 +95,10 @@ public OrtKeyValuePairs Metadata { get { - return new OrtKeyValuePairs(NativeMethods.OrtHardwareDevice_Metadata(handle)); + return new OrtKeyValuePairs(NativeMethods.OrtHardwareDevice_Metadata(_handle)); } } - /// - /// Indicates whether the native handle is invalid. - /// - public override bool IsInvalid => handle == IntPtr.Zero; - - /// - /// No-op. OrtHardwareDevice is always read-only as the instance is owned by native ORT. - /// - /// True - protected override bool ReleaseHandle() - { - return true; - } + private readonly IntPtr _handle; } } \ No newline at end of file diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs index de6189e105f78..d60bf75ccbd7c 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs @@ -636,9 +636,32 @@ public void AddFreeDimensionOverrideByName(string dimName, long dimValue) public void SetEpSelectionPolicy(ExecutionProviderDevicePolicy policy) { NativeApiStatus.VerifySuccess( - NativeMethods.OrtSessionOptionsSetEpSelectionPolicy(handle, (int)policy, IntPtr.Zero)); + NativeMethods.OrtSessionOptionsSetEpSelectionPolicy(handle, (int)policy)); } + /// + /// Set the execution provider selection policy if using automatic execution provider selection. + /// Execution providers must be registered with the OrtEnv to be available for selection. + /// + /// Delegate that implements the custom selection policy. + public void SetEpSelectionPolicyDelegate(EpSelectionDelegate selectionDelegate = null) + { + _epSelectionPolicyConnector = new EpSelectionPolicyConnector(selectionDelegate); + _epSelectionPolicyDelegate = new NativeMethods.DOrtEpSelectionDelegate( + EpSelectionPolicyConnector.EpSelectionPolicyWrapper); + + // make sure these stay alive. 
not sure if this is necessary when they're class members though + _epSelectionPolicyConnectorHandle = GCHandle.Alloc(_epSelectionPolicyConnector); + _epSelectionPolicyDelegateHandle = GCHandle.Alloc(_epSelectionPolicyDelegate); + + IntPtr funcPtr = Marshal.GetFunctionPointerForDelegate(_epSelectionPolicyDelegate); + + NativeApiStatus.VerifySuccess( + NativeMethods.OrtSessionOptionsSetEpSelectionPolicyDelegate( + handle, + funcPtr, + GCHandle.ToIntPtr(_epSelectionPolicyConnectorHandle))); + } #endregion internal IntPtr Handle @@ -914,7 +937,98 @@ public void SetLoadCancellationFlag(bool value) { NativeApiStatus.VerifySuccess(NativeMethods.OrtSessionOptionsSetLoadCancellationFlag(handle, value)); } + #endregion + + #region Selection Policy Delegate helpers + /// + /// Delegate to select execution provider devices from a list of available devices. + /// + /// OrtEpDevices to select from. + /// Model metadata. + /// Runtime metadata. + /// Maximum number of devices that can be selected. + /// Selected devices. Ordered by priority. Highest priority first. + public delegate List EpSelectionDelegate(IReadOnlyList epDevices, + OrtKeyValuePairs modelMetadata, + OrtKeyValuePairs runtimeMetadata, + uint maxSelections); + + /// + /// Class to bridge the C# and native worlds for the EP selection policy delegate + /// + internal class EpSelectionPolicyConnector + { + private readonly EpSelectionDelegate _csharpDelegate; + + internal EpSelectionPolicyConnector(EpSelectionDelegate selectionDelegate) + { + _csharpDelegate = selectionDelegate; + } + + /// + /// Delegate to convert between the C and C# worlds + /// + /// OrtEpDevices to select from. + /// Number of OrtEpDevices. + /// Model metadata. + /// Runtime metadata. + /// Pre-allocated OrtEpDevice buffer to update with selected devices. + /// Number of entries in selectedOut. + /// Number of OrtEpDevies that were selected. + /// Opaque state. + /// nullptr for OrtStatus* to indicate success. + /// Currently we don't have a way to create an OrtStatus instance from the C# bindings. + /// Can add if we need to return an explicit error message. 
+ /// + public static IntPtr EpSelectionPolicyWrapper(IntPtr /* OrtEpDevice** */ epDevicesIn, + uint numDevices, + IntPtr /* OrtKeyValuePairs* */ modelMetadataIn, + IntPtr /* OrtKeyValuePairs* */ runtimeMetadataIn, + IntPtr /* OrtEpDevice** */ selectedOut, + uint maxSelected, + out UIntPtr numSelected, + IntPtr state) + { + Span epDevicesIntPtrs; + Span selectedDevicesIntPtrs; + EpSelectionPolicyConnector connector = (EpSelectionPolicyConnector)GCHandle.FromIntPtr(state).Target; + + unsafe + { + void* ptr = epDevicesIn.ToPointer(); + epDevicesIntPtrs = new Span(ptr, checked((int)numDevices)); + } + + List epDevices = new List(); + for (int i = 0; i < numDevices; i++) + { + + epDevices.Add(new OrtEpDevice(epDevicesIntPtrs[i])); + } + + OrtKeyValuePairs modelMetadata = new OrtKeyValuePairs(modelMetadataIn); + OrtKeyValuePairs runtimeMetadata = new OrtKeyValuePairs(runtimeMetadataIn); + var selected = connector._csharpDelegate(epDevices, modelMetadata, runtimeMetadata, maxSelected); + + numSelected = (UIntPtr)selected.Count; + + unsafe + { + void* ptr = selectedOut.ToPointer(); + selectedDevicesIntPtrs = new Span(ptr, (int)maxSelected); + } + + int idx = 0; + foreach (var epDevice in selected) + { + selectedDevicesIntPtrs[idx] = epDevice.Handle; + idx++; + } + + return IntPtr.Zero; + } + } #endregion #region Private Methods @@ -1000,8 +1114,43 @@ protected override bool ReleaseHandle() { NativeMethods.OrtReleaseSessionOptions(handle); handle = IntPtr.Zero; + + if (_epSelectionPolicyConnectorHandle.IsAllocated) + { + _epSelectionPolicyConnectorHandle.Free(); + _epSelectionPolicyConnector = null; + } + + if (_epSelectionPolicyDelegateHandle.IsAllocated) + { + _epSelectionPolicyDelegateHandle.Free(); + _epSelectionPolicyDelegate = null; + } + + return true; } #endregion + + /// + /// Helper class to connect C and C# usage of the EP selection policy delegate. + /// + EpSelectionPolicyConnector _epSelectionPolicyConnector = null; + + /// + /// Handle to the EP selection policy connector that is passed to the C API as state for the + /// EP selection policy delegate. + /// + GCHandle _epSelectionPolicyConnectorHandle = default; + + /// + /// Delegate instance that is provided to the C API. + /// + NativeMethods.DOrtEpSelectionDelegate _epSelectionPolicyDelegate = null; + + /// + /// Handle to the EP selection policy delegate that is passed to the C API. 
+ /// + GCHandle _epSelectionPolicyDelegateHandle = default; } } diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs index 1aa4db15d275c..d95a649bd95c5 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs @@ -93,7 +93,7 @@ public void AppendToSessionOptionsV2() { var runTest = (Func> getEpOptions) => { - SessionOptions sessionOptions = new SessionOptions(); + using SessionOptions sessionOptions = new SessionOptions(); sessionOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE; var epDevices = ortEnvInstance.GetEpDevices(); @@ -138,7 +138,7 @@ public void AppendToSessionOptionsV2() [Fact] public void SetEpSelectionPolicy() { - SessionOptions sessionOptions = new SessionOptions(); + using SessionOptions sessionOptions = new SessionOptions(); sessionOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE; var epDevices = ortEnvInstance.GetEpDevices(); @@ -150,7 +150,50 @@ public void SetEpSelectionPolicy() var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); // session should load successfully - using (var session = new InferenceSession(model)) + using (var session = new InferenceSession(model, sessionOptions)) + { + Assert.NotNull(session); + } + } + + private static List SelectionPolicyDelegate(IReadOnlyList epDevices, + OrtKeyValuePairs modelMetadata, + OrtKeyValuePairs runtimeMetadata, + uint maxSelections) + { + Assert.NotEmpty(modelMetadata.Entries); + Assert.True(epDevices.Count > 0); + + // select first device and last (if there are more than one). + var selected = new List(); + + selected.Add(epDevices[0]); + + // add ORT CPU EP which is always last. + if (maxSelections > 2 && epDevices.Count > 1) + { + selected.Add(epDevices.Last()); + } + + return selected; + } + + [Fact] + public void SetEpSelectionPolicyDelegate() + { + using SessionOptions sessionOptions = new SessionOptions(); + sessionOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE; + + var epDevices = ortEnvInstance.GetEpDevices(); + Assert.NotEmpty(epDevices); + + // doesn't matter what the value is. should fallback to ORT CPU EP + sessionOptions.SetEpSelectionPolicyDelegate(SelectionPolicyDelegate); + + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + + // session should load successfully + using (var session = new InferenceSession(model, sessionOptions)) { Assert.NotNull(session); } diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index cef5eab9a505e..6c7d910b4963b 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -438,18 +438,20 @@ typedef enum OrtExecutionProviderDevicePolicy { * \param max_ep_devices The maximum number of devices that can be selected in the pre-allocated array. Currently the maximum is 8. * \param num_ep_devices The number of selected devices. + * \param state Opaque pointer. Required to use the delegate from other languages like C# and python. * * \return OrtStatus* Selection status. Return nullptr on success. * Use CreateStatus to provide error info. Use ORT_FAIL as the error code. * ORT will release the OrtStatus* if not null. 
*/ -typedef OrtStatus* (*EpSelectionDelegate)(_In_ const OrtEpDevice** ep_devices, - _In_ size_t num_devices, - _In_ const OrtKeyValuePairs* model_metadata, - _In_opt_ const OrtKeyValuePairs* runtime_metadata, - _Inout_ const OrtEpDevice** selected, - _In_ size_t max_selected, - _Out_ size_t* num_selected); +typedef OrtStatus*(ORT_API_CALL* EpSelectionDelegate)(_In_ const OrtEpDevice** ep_devices, + _In_ size_t num_devices, + _In_ const OrtKeyValuePairs* model_metadata, + _In_opt_ const OrtKeyValuePairs* runtime_metadata, + _Inout_ const OrtEpDevice** selected, + _In_ size_t max_selected, + _Out_ size_t* num_selected, + _In_ void* state); /** \brief Algorithm to use for cuDNN Convolution Op */ @@ -5127,18 +5129,30 @@ struct OrtApi { /** \brief Set the execution provider selection policy for the session. * - * Allows users to specify a device selection policy for automatic execution provider (EP) selection, - * or provide a delegate callback for custom selection logic. + * Allows users to specify a device selection policy for automatic execution provider (EP) selection. + * If custom selection is required please use SessionOptionsSetEpSelectionPolicyDelegate instead. * * \param[in] session_options The OrtSessionOptions instance. * \param[in] policy The device selection policy to use (see OrtExecutionProviderDevicePolicy). - * \param[in] delegate Optional delegate callback for custom selection. Pass nullptr to use the built-in policy. * * \since Version 1.22 */ ORT_API2_STATUS(SessionOptionsSetEpSelectionPolicy, _In_ OrtSessionOptions* session_options, - _In_ OrtExecutionProviderDevicePolicy policy, - _In_opt_ EpSelectionDelegate* delegate); + _In_ OrtExecutionProviderDevicePolicy policy); + + /** \brief Set the execution provider selection policy delegate for the session. + * + * Allows users to provide a custom device selection policy for automatic execution provider (EP) selection. + * + * \param[in] session_options The OrtSessionOptions instance. + * \param[in] delegate Delegate callback for custom selection. + * \param[in] delegate_state Optional state that will be passed to the delegate callback. nullptr if not required. + * + * \since Version 1.22 + */ + ORT_API2_STATUS(SessionOptionsSetEpSelectionPolicyDelegate, _In_ OrtSessionOptions* session_options, + _In_ EpSelectionDelegate delegate, + _In_opt_ void* delegate_state); /** \brief Get the hardware device type. 
* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 6c175c606b4a1..bc6f381bb82a0 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -1103,8 +1103,10 @@ struct SessionOptionsImpl : ConstSessionOptionsImpl { const std::unordered_map& ep_options); /// Wraps OrtApi::SessionOptionsSetEpSelectionPolicy - SessionOptionsImpl& SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy policy, - EpSelectionDelegate* delegate = nullptr); + SessionOptionsImpl& SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy policy); + + /// Wraps OrtApi::SessionOptionsSetEpSelectionPolicyDelegate + SessionOptionsImpl& SetEpSelectionPolicy(EpSelectionDelegate delegate, void* state = nullptr); SessionOptionsImpl& SetCustomCreateThreadFn(OrtCustomCreateThreadFn ort_custom_create_thread_fn); ///< Wraps OrtApi::SessionOptionsSetCustomCreateThreadFn SessionOptionsImpl& SetCustomThreadCreationOptions(void* ort_custom_thread_creation_options); ///< Wraps OrtApi::SessionOptionsSetCustomThreadCreationOptions diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 1fdb8f16d9600..94ad2118fa4d6 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1150,9 +1150,14 @@ inline SessionOptionsImpl& SessionOptionsImpl::AppendExecutionProvider_V2( } template -inline SessionOptionsImpl& SessionOptionsImpl::SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy policy, - EpSelectionDelegate* delegate) { - ThrowOnError(GetApi().SessionOptionsSetEpSelectionPolicy(this->p_, policy, delegate)); +inline SessionOptionsImpl& SessionOptionsImpl::SetEpSelectionPolicy(OrtExecutionProviderDevicePolicy policy) { + ThrowOnError(GetApi().SessionOptionsSetEpSelectionPolicy(this->p_, policy)); + return *this; +} + +template +inline SessionOptionsImpl& SessionOptionsImpl::SetEpSelectionPolicy(EpSelectionDelegate delegate, void* state) { + ThrowOnError(GetApi().SessionOptionsSetEpSelectionPolicyDelegate(this->p_, delegate, state)); return *this; } diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 8f8a3d6634a7e..89a43c4f71ee6 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -96,7 +96,8 @@ struct EpSelectionPolicy { // and no selection policy was explicitly provided. 
bool enable{false}; OrtExecutionProviderDevicePolicy policy = OrtExecutionProviderDevicePolicy_DEFAULT; - EpSelectionDelegate* delegate{}; + EpSelectionDelegate delegate{}; + void* state{nullptr}; // state for the delegate }; /** diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc index b1c0467da642e..c205e05baadb9 100644 --- a/onnxruntime/core/session/abi_session_options.cc +++ b/onnxruntime/core/session/abi_session_options.cc @@ -367,12 +367,24 @@ ORT_API_STATUS_IMPL(OrtApis::SetDeterministicCompute, _Inout_ OrtSessionOptions* } ORT_API_STATUS_IMPL(OrtApis::SessionOptionsSetEpSelectionPolicy, _In_ OrtSessionOptions* options, - _In_ OrtExecutionProviderDevicePolicy policy, - _In_opt_ EpSelectionDelegate* delegate) { + _In_ OrtExecutionProviderDevicePolicy policy) { API_IMPL_BEGIN options->value.ep_selection_policy.enable = true; options->value.ep_selection_policy.policy = policy; + options->value.ep_selection_policy.delegate = nullptr; + options->value.ep_selection_policy.state = nullptr; + return nullptr; + API_IMPL_END +} + +ORT_API_STATUS_IMPL(OrtApis::SessionOptionsSetEpSelectionPolicyDelegate, _In_ OrtSessionOptions* options, + _In_opt_ EpSelectionDelegate delegate, + _In_opt_ void* state) { + API_IMPL_BEGIN + options->value.ep_selection_policy.enable = true; + options->value.ep_selection_policy.policy = OrtExecutionProviderDevicePolicy_DEFAULT; options->value.ep_selection_policy.delegate = delegate; + options->value.ep_selection_policy.state = state; return nullptr; API_IMPL_END } diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 8ec7312cc6354..df70856a64e99 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -3266,6 +3266,7 @@ common::Status InferenceSession::SaveModelMetadata(const onnxruntime::Model& mod // save model metadata model_metadata_.producer_name = model.ProducerName(); + model_metadata_.producer_version = model.ProducerVersion(); model_metadata_.description = model.DocString(); model_metadata_.graph_description = model.GraphDocString(); model_metadata_.domain = model.Domain(); @@ -3430,6 +3431,10 @@ const Model& InferenceSession::GetModel() const { return *model_; } +const Environment& InferenceSession::GetEnvironment() const { + return environment_; +} + SessionIOBinding::SessionIOBinding(InferenceSession* session) : sess_(session) { ORT_ENFORCE(session->NewIOBinding(&binding_).IsOK()); } diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index ba9812a59fec3..51350390a0456 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -80,6 +80,7 @@ struct ModelMetadata { ModelMetadata& operator=(const ModelMetadata&) = delete; std::string producer_name; + std::string producer_version; std::string graph_name; std::string domain; std::string description; @@ -603,6 +604,7 @@ class InferenceSession { #endif const Model& GetModel() const; + const Environment& GetEnvironment() const; protected: #if !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 304966605c9cf..d03b98a9c1eb5 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2980,6 +2980,7 @@ static constexpr OrtApi ort_api_1_to_23 = { &OrtApis::GetEpDevices, 
&OrtApis::SessionOptionsAppendExecutionProvider_V2, &OrtApis::SessionOptionsSetEpSelectionPolicy, + &OrtApis::SessionOptionsSetEpSelectionPolicyDelegate, &OrtApis::HardwareDevice_Type, &OrtApis::HardwareDevice_VendorId, @@ -3029,7 +3030,7 @@ static_assert(offsetof(OrtApi, AddExternalInitializersFromFilesInMemory) / sizeo // no additions in version 19, 20, and 21 static_assert(offsetof(OrtApi, SetEpDynamicOptions) / sizeof(void*) == 284, "Size of version 20 API cannot change"); -static_assert(offsetof(OrtApi, GetEpApi) / sizeof(void*) == 316, "Size of version 22 API cannot change"); +static_assert(offsetof(OrtApi, GetEpApi) / sizeof(void*) == 317, "Size of version 22 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: static_assert(std::string_view(ORT_VERSION) == "1.23.0", diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index 7be518a39480f..47d1a543b5a31 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -577,8 +577,11 @@ ORT_API_STATUS_IMPL(SessionOptionsAppendExecutionProvider_V2, _In_ OrtSessionOpt size_t num_ep_options); ORT_API_STATUS_IMPL(SessionOptionsSetEpSelectionPolicy, _In_ OrtSessionOptions* sess_options, - _In_ OrtExecutionProviderDevicePolicy policy, - _In_opt_ EpSelectionDelegate* delegate); + _In_ OrtExecutionProviderDevicePolicy policy); + +ORT_API_STATUS_IMPL(SessionOptionsSetEpSelectionPolicyDelegate, _In_ OrtSessionOptions* sess_options, + _In_ EpSelectionDelegate delegate, + _In_opt_ void* state); // OrtHardwareDevice accessors. ORT_API(OrtHardwareDeviceType, HardwareDevice_Type, _In_ const OrtHardwareDevice* device); diff --git a/onnxruntime/core/session/provider_policy_context.cc b/onnxruntime/core/session/provider_policy_context.cc index 4ce13fe36ea86..f706bd05d8494 100644 --- a/onnxruntime/core/session/provider_policy_context.cc +++ b/onnxruntime/core/session/provider_policy_context.cc @@ -94,8 +94,8 @@ std::vector OrderDevices(const std::vectorep_name < b->ep_name; } // one is the default CPU EP @@ -104,31 +104,57 @@ std::vector OrderDevices(const std::vector GPU -> NPU // TODO: Should environment.cc do the ordering? - const auto& execution_devices = OrderDevices(env.GetOrtEpDevices()); + std::vector execution_devices = OrderDevices(env.GetOrtEpDevices()); // The list of devices selected by policies std::vector devices_selected; // Run the delegate if it was passed in lieu of any other policy if (options.value.ep_selection_policy.delegate) { - auto policy_fn = options.value.ep_selection_policy.delegate; + auto model_metadata = GetModelMetadata(sess); + OrtKeyValuePairs runtime_metadata; // TODO: where should this come from? 
+ std::vector delegate_devices(execution_devices.begin(), execution_devices.end()); std::array selected_devices{nullptr}; - size_t num_selected = 0; - auto* status = (*policy_fn)(delegate_devices.data(), delegate_devices.size(), - nullptr, nullptr, selected_devices.data(), selected_devices.size(), &num_selected); + + EpSelectionDelegate delegate = options.value.ep_selection_policy.delegate; + auto* status = delegate(delegate_devices.data(), delegate_devices.size(), + &model_metadata, &runtime_metadata, + selected_devices.data(), selected_devices.size(), &num_selected, + options.value.ep_selection_policy.state); // return or fall-through for both these cases // going with explicit failure for now so it's obvious to user what is happening @@ -142,6 +168,12 @@ Status ProviderPolicyContext::SelectEpsForSession(const Environment& env, const if (num_selected == 0) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "EP selection delegate did not select anything."); } + + // Copy the selected devices to the output vector + devices_selected.reserve(num_selected); + for (size_t i = 0; i < num_selected; ++i) { + devices_selected.push_back(selected_devices[i]); + } } else { // Create the selector for the chosen policy std::unique_ptr selector; diff --git a/onnxruntime/core/session/utils.cc b/onnxruntime/core/session/utils.cc index c05394039d8c7..8ca4ef6af1f44 100644 --- a/onnxruntime/core/session/utils.cc +++ b/onnxruntime/core/session/utils.cc @@ -176,20 +176,6 @@ static OrtStatus* CreateSessionAndLoadModelImpl(_In_ const OrtSessionOptions* op env); } -#if !defined(ORT_MINIMAL_BUILD) - // TEMPORARY for testing. Manually specify the EP to select. - auto auto_select_ep_name = sess->GetSessionOptions().config_options.GetConfigEntry("test.ep_to_select"); - if (auto_select_ep_name) { - ORT_API_RETURN_IF_STATUS_NOT_OK(TestAutoSelectEPsImpl(env, *sess, *auto_select_ep_name)); - } - - // if there are no providers registered, and there's an ep selection policy set, do auto ep selection - if (options != nullptr && options->provider_factories.empty() && options->value.ep_selection_policy.enable) { - ProviderPolicyContext context; - ORT_API_RETURN_IF_STATUS_NOT_OK(context.SelectEpsForSession(env, *options, *sess)); - } -#endif // !defined(ORT_MINIMAL_BUILD) - #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) // Add custom domains if (options && !options->custom_op_domains_.empty()) { @@ -231,22 +217,38 @@ OrtStatus* InitializeSession(_In_ const OrtSessionOptions* options, ORT_ENFORCE(session_logger != nullptr, "Session logger is invalid, but should have been initialized during session construction."); - // we need to disable mem pattern if DML is one of the providers since DML doesn't have the concept of - // byte addressable memory - std::vector> provider_list; - if (options) { + const bool has_provider_factories = options != nullptr && !options->provider_factories.empty(); + + if (has_provider_factories) { + std::vector> provider_list; for (auto& factory : options->provider_factories) { auto provider = factory->CreateProvider(*options, *session_logger->ToExternal()); provider_list.push_back(std::move(provider)); } + + // register the providers + for (auto& provider : provider_list) { + if (provider) { + ORT_API_RETURN_IF_STATUS_NOT_OK(sess.RegisterExecutionProvider(std::move(provider))); + } + } } +#if !defined(ORT_MINIMAL_BUILD) + else { + // TEMPORARY for testing. Manually specify the EP to select. 
+ auto auto_select_ep_name = sess.GetSessionOptions().config_options.GetConfigEntry("test.ep_to_select"); + if (auto_select_ep_name) { + ORT_API_RETURN_IF_STATUS_NOT_OK(TestAutoSelectEPsImpl(sess.GetEnvironment(), sess, *auto_select_ep_name)); + } - // register the providers - for (auto& provider : provider_list) { - if (provider) { - ORT_API_RETURN_IF_STATUS_NOT_OK(sess.RegisterExecutionProvider(std::move(provider))); + // if there are no providers registered, and there's an ep selection policy set, do auto ep selection. + // note: the model has already been loaded so model metadata should be available to the policy delegate callback. + if (options != nullptr && options->value.ep_selection_policy.enable) { + ProviderPolicyContext context; + ORT_API_RETURN_IF_STATUS_NOT_OK(context.SelectEpsForSession(sess.GetEnvironment(), *options, sess)); } } +#endif // !defined(ORT_MINIMAL_BUILD) if (prepacked_weights_container != nullptr) { ORT_API_RETURN_IF_STATUS_NOT_OK(sess.AddPrePackedWeightsContainer( diff --git a/onnxruntime/test/autoep/test_autoep_selection.cc b/onnxruntime/test/autoep/test_autoep_selection.cc index 04b1b2ea0bdc4..cea1299adc26f 100644 --- a/onnxruntime/test/autoep/test_autoep_selection.cc +++ b/onnxruntime/test/autoep/test_autoep_selection.cc @@ -68,6 +68,7 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod const std::function&)>& select_devices = nullptr, // auto select using policy std::optional policy = std::nullopt, + std::optional delegate = std::nullopt, bool test_session_creation_only = false) { Ort::SessionOptions session_options; @@ -77,7 +78,9 @@ static void TestInference(Ort::Env& env, const std::basic_string& mod } if (auto_select) { - if (policy) { + if (delegate) { + session_options.SetEpSelectionPolicy(*delegate, nullptr); + } else if (policy) { session_options.SetEpSelectionPolicy(*policy); } else { // manually specify EP to select @@ -353,6 +356,150 @@ TEST(AutoEpSelection, PreferNpu) { OrtExecutionProviderDevicePolicy::OrtExecutionProviderDevicePolicy_PREFER_NPU); } +static OrtStatus* ORT_API_CALL PolicyDelegate(_In_ const OrtEpDevice** ep_devices, + _In_ size_t num_devices, + _In_ const OrtKeyValuePairs* model_metadata, + _In_opt_ const OrtKeyValuePairs* /*runtime_metadata*/, + _Inout_ const OrtEpDevice** selected, + _In_ size_t max_selected, + _Out_ size_t* num_selected, + _In_ void* /*state*/) { + *num_selected = 0; + + if (max_selected <= 2) { + return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "Expected to be able to select 2 devices."); + } + + if (model_metadata->entries.empty()) { + return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "Model metadata was empty."); + } + + selected[0] = ep_devices[0]; + *num_selected = 1; + if (num_devices > 1) { + // CPU EP is always last. 
+ selected[1] = ep_devices[num_devices - 1]; + *num_selected = 2; + } + + return nullptr; +} + +static OrtStatus* ORT_API_CALL PolicyDelegateSelectNone(_In_ const OrtEpDevice** /*ep_devices*/, + _In_ size_t /*num_devices*/, + _In_ const OrtKeyValuePairs* /*model_metadata*/, + _In_opt_ const OrtKeyValuePairs* /*runtime_metadata*/, + _Inout_ const OrtEpDevice** /*selected*/, + _In_ size_t /*max_selected*/, + _Out_ size_t* num_selected, + _In_ void* /*state*/) { + *num_selected = 0; + + return nullptr; +} + +static OrtStatus* ORT_API_CALL PolicyDelegateReturnError(_In_ const OrtEpDevice** /*ep_devices*/, + _In_ size_t /*num_devices*/, + _In_ const OrtKeyValuePairs* /*model_metadata*/, + _In_opt_ const OrtKeyValuePairs* /*runtime_metadata*/, + _Inout_ const OrtEpDevice** /*selected*/, + _In_ size_t /*max_selected*/, + _Out_ size_t* num_selected, + _In_ void* /*state*/) { + *num_selected = 0; + + return Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, "Selection error."); +} + +// test providing a delegate +TEST(AutoEpSelection, PolicyDelegate) { + std::vector> inputs(1); + auto& input = inputs.back(); + input.name = "X"; + input.dims = {3, 2}; + input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + // prepare expected inputs and outputs + std::vector expected_dims_y = {3, 2}; + std::vector expected_values_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f}; + + const Ort::KeyValuePairs provider_options; + + TestInference(*ort_env, ORT_TSTR("testdata/mul_1.onnx"), + "", // don't need EP name + std::nullopt, + provider_options, + inputs, + "Y", + expected_dims_y, + expected_values_y, + /* auto_select */ true, + /*select_devices*/ nullptr, + std::nullopt, + PolicyDelegate); +} + +// test providing a delegate +TEST(AutoEpSelection, PolicyDelegateSelectsNothing) { + std::vector> inputs(1); + auto& input = inputs.back(); + input.name = "X"; + input.dims = {3, 2}; + input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + // prepare expected inputs and outputs + std::vector expected_dims_y = {3, 2}; + std::vector expected_values_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f}; + + const Ort::KeyValuePairs provider_options; + + ASSERT_THROW( + TestInference(*ort_env, ORT_TSTR("testdata/mul_1.onnx"), + "", // don't need EP name + std::nullopt, + provider_options, + inputs, + "Y", + expected_dims_y, + expected_values_y, + /* auto_select */ true, + /*select_devices*/ nullptr, + std::nullopt, + PolicyDelegateSelectNone, + /*test_session_creation_only*/ true), + Ort::Exception); +} + +TEST(AutoEpSelection, PolicyDelegateReturnsError) { + std::vector> inputs(1); + auto& input = inputs.back(); + input.name = "X"; + input.dims = {3, 2}; + input.values = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + // prepare expected inputs and outputs + std::vector expected_dims_y = {3, 2}; + std::vector expected_values_y = {1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f}; + + const Ort::KeyValuePairs provider_options; + + ASSERT_THROW( + TestInference(*ort_env, ORT_TSTR("testdata/mul_1.onnx"), + "", // don't need EP name + std::nullopt, + provider_options, + inputs, + "Y", + expected_dims_y, + expected_values_y, + /* auto_select */ true, + /*select_devices*/ nullptr, + std::nullopt, + PolicyDelegateReturnError, + /*test_session_creation_only*/ true), + Ort::Exception); +} + namespace { struct ExamplePluginInfo { const std::filesystem::path library_path = From 5160c67a3bdc1bc8de80be2afa1da3affb84ad46 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Wed, 7 May 2025 00:38:05 +0800 Subject: [PATCH 15/84] [webgpu] Add Matmul8bits Support (#24546) ### 
Description This PR adds the support for 8-bit quantization in the `MatMulNBits` operation in WebGPU. It does below things: 1. Unify to use `MatMulNBitsProgram` as the fallback path which is the original generation path for block size = 32. Now make it support any blocks size without limitations. And remove the original complicated programs. 2. Enable `MatMulNBitsWideTileProgram` for all platforms. --- .../webgpu/quantization/matmul_nbits.cc | 863 ++++++------------ .../webgpu/quantization/matmul_nbits.h | 47 +- .../test/contrib_ops/matmul_8bits_test.cc | 12 +- 3 files changed, 303 insertions(+), 619 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index a29cdea6b4af9..65ecdff44acd6 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include #include #include "contrib_ops/webgpu/quantization/matmul_nbits.h" @@ -18,17 +19,42 @@ namespace webgpu { namespace { -std::string QuantizedDataType(int components) { - switch (components) { - case 1: - return "array"; - case 2: - return "mat4x2"; - case 4: - return "mat2x4"; - default: - return "array"; +std::string ReadZeroPoint(uint32_t nbits, bool has_zero_points) { + ORT_ENFORCE(nbits == 8 || nbits == 4, "Only 4/8 bits are supported for webgpu matmulnbits"); + std::stringstream ss; + if (has_zero_points) { + ss << "const elements_in_uint32 = " << (32 / nbits) << "u;\n" + << "const bits = " << nbits << "u;\n"; + ss << R"( +fn mm_read_zero(row : u32, col : u32, r_dim: u32, c_dim: u32) -> output_element_t { + if (row < r_dim && col < c_dim) { + let offset = row * c_dim + col; + + // u32 holds elements_in_uint32 packed nbits. + let array_index = offset / elements_in_uint32; + let component_index = offset % elements_in_uint32; + let packed_value = zero_points[array_index]; + + // Extract the nbits component + let shift_amount = component_index * bits; +)"; + ss << " let masked_value = (packed_value >> shift_amount) & " << (nbits == 4 ? "0xFu" : "0xFF") << ";\n"; + ss << R"( + return output_element_t(masked_value); } + return output_element_t(0); +} +)"; + } else { + ss << "const default_zero_point = " << (nbits == 4 ? 8 : 128) << ";\n"; + ss << R"( +fn mm_read_zero(row : u32, col : u32, r_dim: u32, c_dim: u32) -> output_element_t { + // The default zero point is 8. 
+ return output_element_t(default_zero_point); +} +)"; + } + return ss.str(); } constexpr unsigned int kMinMForTileOptimization = 4; @@ -46,483 +72,6 @@ ONNX_OPERATOR_KERNEL_EX( .TypeConstraint("T4", DataTypeImpl::GetTensorType()), MatMulNBits); -Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { - const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); - const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); - const auto& scales = shader.AddInput("scales", ShaderUsage::UseUniform); - const auto& y = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias | ShaderUsage::UseIndicesTypeAlias); - - if (block_size_ == 32) { - const uint32_t workgroup_size = WorkgroupSizeX() * WorkgroupSizeY(); - const uint32_t tile_size = WorkgroupSizeX() * components_b_ * 8; // each uint32 has 8 data. - const uint32_t a_length_per_tile = tile_size / a.NumComponents(); - const uint32_t blocks_per_tile = tile_size / block_size_; - if (tile_m_ > 1 && use_subgroup_) { - ORT_ENFORCE(a.NumComponents() == 4, "input a's components must be equal to 4."); - ORT_ENFORCE(components_b_ == 4, "input b's components must be equal to 4."); - shader.AdditionalImplementation() << "fn mm_readA(batch : u32, row : u32, col : u32) -> input_a_value_t {\n" - " if (row < uniforms.input_a_shape[1] && col < uniforms.input_a_shape[2]) {\n" - << " return " << a.GetByIndices("input_a_indices_t(batch, row, col)") << ";\n" - << " } else {\n" - " return input_a_value_t(0);\n" - " }\n" - "}\n" - << "var sub_b: array, " << WorkgroupSizeY() << ">;\n" - << "var sub_scale: array, " << WorkgroupSizeY() << ">;\n" - << "var inter_results: array, " << WorkgroupSizeY() << ">," << tile_m_ << ">;\n"; - shader.MainFunctionBody() << " let col = workgroup_id.x * " << WorkgroupSizeY() << ";\n" - << " let row = workgroup_id.y * " << tile_m_ << ";\n" - << " let batch = workgroup_id.z;\n"; - shader.MainFunctionBody() << " let n_blocks_per_col = uniforms.input_b_shape[1];\n" - << " let num_tiles = (n_blocks_per_col - 1) / " << blocks_per_tile << " + 1;\n" - // Loop over shared dimension. - << " for (var tile: u32 = 0; tile < num_tiles; tile += 1) {\n" - << " // load one tile B/scale data into shared memory.\n" - // Each thread processes one block. 
- " let b_col = col + local_id.y;\n" - << " let block = tile * " << blocks_per_tile << " + local_id.x;\n" - << " if (b_col < uniforms.input_b_shape[0] && block < n_blocks_per_col) {\n" - << " sub_b[local_id.y][local_id.x] = " << b.GetByIndices("input_b_indices_t(b_col, block, 0)") << ";\n" - << " sub_scale[local_id.y][local_id.x] = " << scales.GetByOffset("b_col * n_blocks_per_col + block") << ";\n" - << " } else {\n" - " sub_b[local_id.y][local_id.x] = input_b_value_t(0);\n" - " sub_scale[local_id.y][local_id.x] = output_value_t(0);\n" - " }\n" - " workgroupBarrier();\n" - << " var in_y = (local_idx % 32) / 4;\n" - " var in_x = (local_idx / 32) * 4 + local_idx % 4;\n" - << " var word_offset = (local_idx % 4) * " << block_size_ / a.NumComponents() << ";\n" - << " if (sg_size == 8u) {\n" - " in_y = local_idx % 8;\n" - " in_x = local_idx / 8;\n" - << " word_offset = 0u;\n" - " } else if (sg_size == 16u) {\n" - " in_y = (local_idx % 16) / 2;\n" - " in_x = (local_idx / 16) * 2 + local_idx % 2;\n" - << " word_offset = (local_idx % 2) * " << block_size_ / a.NumComponents() << ";\n" - << " } else if (sg_size == 32u) {\n" - " in_y = (local_idx % 32) / 4;\n" - " in_x = (local_idx / 32) * 4 + local_idx % 4;\n" - << " word_offset = (local_idx % 4) * " << block_size_ / a.NumComponents() << ";\n" - << " } else if (sg_size == 64u) {\n" - " in_y = local_idx / 8;\n" - " in_x = local_idx % 8;\n" - << " word_offset = (local_idx % 8) * " << block_size_ / a.NumComponents() << ";\n" - << " }\n"; - if (has_zero_points_) { - const auto& zero_points = shader.AddInput("zero_points", ShaderUsage::UseUniform); - shader.MainFunctionBody() << " let zero_point_bytes_per_col = (n_blocks_per_col + 1) / 2;\n" - " let zero_point_byte_count = b_col * zero_point_bytes_per_col + (block >> 0x1u);\n" - " let zero_point_word_index = zero_point_byte_count >> 0x2u;\n" - " let zero_point_byte_offset = zero_point_byte_count & 0x3u;\n" - " let zero_point_nibble_offset: u32 = block & 0x1u;\n" - " let zero_point_bits_offset = (zero_point_byte_offset << 3) + (zero_point_nibble_offset << 2);\n" - << " let zero_point_word = " << zero_points.GetByOffset("zero_point_word_index") << " >> zero_point_bits_offset;\n" - << " let zero_point = output_element_t((zero_point_word) & 0xFu);\n"; - } else { - // The default zero point is 8 for unsigned 4-bit quantization. 
- shader.MainFunctionBody() << " let zero_point = output_element_t(8.0);\n"; - } - shader.MainFunctionBody() << " let scale = sub_scale[in_y][in_x];\n" - " let b_data = sub_b[in_y][in_x];\n"; - shader.MainFunctionBody() << " let a_col_start = tile * " << a_length_per_tile << ";\n"; - for (uint32_t i = 0; i < tile_m_; i++) { - shader.MainFunctionBody() << " let a_data" << i << " = mm_readA(batch, row + " << i << ", a_col_start + local_idx);\n"; - } - - shader.MainFunctionBody() << " if (sg_size == 8u) {\n"; - shader.MainFunctionBody() << " for (var i: u32 = 0; i < 4; i++) {\n"; - shader.MainFunctionBody() << " let b_value = b_data[i];\n" - " let b_value_lower = unpack4xU8(b_value & 0x0F0F0F0Fu);\n" - " let b_value_upper = unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu);\n" - " let b_quantized_values = mat2x4(output_element_t(b_value_lower[0]), output_element_t(b_value_upper[0]), output_element_t(b_value_lower[1]), output_element_t(b_value_upper[1]), output_element_t(b_value_lower[2]), output_element_t(b_value_upper[2]), output_element_t(b_value_lower[3]), output_element_t(b_value_upper[3]));\n" - " let b_dequantized_values = (b_quantized_values - mat2x4(zero_point, zero_point, zero_point, zero_point, zero_point, zero_point, zero_point, zero_point)) * scale;\n"; - for (uint32_t i = 0; i < tile_m_; i++) { - if (i == 0) { - shader.MainFunctionBody() << " var "; - } - shader.MainFunctionBody() << " a0 = subgroupShuffle(a_data" << i << ", i * 2);\n"; - if (i == 0) { - shader.MainFunctionBody() << " var "; - } - shader.MainFunctionBody() << " a1 = subgroupShuffle(a_data" << i << ", i * 2 + 1);\n"; - shader.MainFunctionBody() << " inter_results[" << i << "][in_y][in_x] += dot(a0, b_dequantized_values[0]) + dot(a1, b_dequantized_values[1]);\n"; - } - shader.MainFunctionBody() << " }\n"; - shader.MainFunctionBody() << " } else if (sg_size == 16u) {\n"; - shader.MainFunctionBody() << " for (var i: u32 = 0; i < 4; i++) {\n"; - shader.MainFunctionBody() << " let b_value = b_data[i];\n" - " let b_value_lower = unpack4xU8(b_value & 0x0F0F0F0Fu);\n" - " let b_value_upper = unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu);\n" - " let b_quantized_values = mat2x4(output_element_t(b_value_lower[0]), output_element_t(b_value_upper[0]), output_element_t(b_value_lower[1]), output_element_t(b_value_upper[1]), output_element_t(b_value_lower[2]), output_element_t(b_value_upper[2]), output_element_t(b_value_lower[3]), output_element_t(b_value_upper[3]));\n" - " let b_dequantized_values = (b_quantized_values - mat2x4(zero_point, zero_point, zero_point, zero_point, zero_point, zero_point, zero_point, zero_point)) * scale;\n"; - for (uint32_t i = 0; i < tile_m_; i++) { - if (i == 0) { - shader.MainFunctionBody() << " var "; - } - shader.MainFunctionBody() << " a0 = subgroupShuffle(a_data" << i << ", i * 2);\n"; - if (i == 0) { - shader.MainFunctionBody() << " var "; - } - shader.MainFunctionBody() << " a00 = subgroupShuffle(a_data" << i << ", i * 2 + 8);\n"; - if (i == 0) { - shader.MainFunctionBody() << " var "; - } - shader.MainFunctionBody() << " a1 = subgroupShuffle(a_data" << i << ", i * 2 + 1);\n"; - if (i == 0) { - shader.MainFunctionBody() << " var "; - } - shader.MainFunctionBody() << " a11 = subgroupShuffle(a_data" << i << ", i * 2 + 9);\n"; - shader.MainFunctionBody() << " inter_results[" << i << "][in_y][in_x] += dot(select(a00, a0, local_idx % 2 == 0), b_dequantized_values[0]) + dot(select(a11, a1, local_idx % 2 == 0), b_dequantized_values[1]);\n"; - } - shader.MainFunctionBody() << " word_offset += " << 8 / 
a.NumComponents() << ";\n" - << " }\n"; - shader.MainFunctionBody() << " } else {\n"; - shader.MainFunctionBody() << " for (var i: u32 = 0; i < 4; i++) {\n"; - shader.MainFunctionBody() << " let b_value = b_data[i];\n" - " let b_value_lower = unpack4xU8(b_value & 0x0F0F0F0Fu);\n" - " let b_value_upper = unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu);\n" - " let b_quantized_values = mat2x4(output_element_t(b_value_lower[0]), output_element_t(b_value_upper[0]), output_element_t(b_value_lower[1]), output_element_t(b_value_upper[1]), output_element_t(b_value_lower[2]), output_element_t(b_value_upper[2]), output_element_t(b_value_lower[3]), output_element_t(b_value_upper[3]));\n" - " let b_dequantized_values = (b_quantized_values - mat2x4(zero_point, zero_point, zero_point, zero_point, zero_point, zero_point, zero_point, zero_point)) * scale;\n"; - for (uint32_t i = 0; i < tile_m_; i++) { - if (i == 0) { - shader.MainFunctionBody() << " var "; - } - shader.MainFunctionBody() << " a0 = subgroupShuffle(a_data" << i << ", word_offset);\n"; - if (i == 0) { - shader.MainFunctionBody() << " var "; - } - shader.MainFunctionBody() << " a1 = subgroupShuffle(a_data" << i << ", word_offset + 1);\n"; - shader.MainFunctionBody() << " inter_results[" << i << "][in_y][in_x] += dot(a0, b_dequantized_values[0]) + dot(a1, b_dequantized_values[1]);\n"; - } - shader.MainFunctionBody() << " word_offset += " << 8 / a.NumComponents() << ";\n"; - shader.MainFunctionBody() << " }\n"; - shader.MainFunctionBody() << " }\n"; - shader.MainFunctionBody() << " workgroupBarrier();\n"; - - shader.MainFunctionBody() << " }\n"; - shader.MainFunctionBody() << " if (local_idx < " << WorkgroupSizeY() * tile_m_ << ") {\n" - << " let inner_row = local_idx / " << WorkgroupSizeY() << ";\n" - << " let inner_col = local_idx % " << WorkgroupSizeY() << ";\n" - << " var output_value = output_value_t(0);\n" - << " for (var b = 0u; b < " << WorkgroupSizeX() << "; b++) {\n" - << " output_value += inter_results[inner_row][inner_col][b];\n" - " }\n" - " if (row + inner_row < uniforms.output_shape[1] && col + inner_col < uniforms.output_shape[2]) {\n" - << " " << y.SetByIndices("output_indices_t(batch, row + inner_row, col + inner_col)", "output_value") << ";\n" - << " }\n" - " }\n"; - } else { - if (tile_m_ == 1) { - shader.AdditionalImplementation() << "fn mm_readA(batch : u32, row : u32, col : u32) -> input_a_value_t {\n" - " if (col < uniforms.input_a_shape[2]) {\n" - << " return " << a.GetByIndices("input_a_indices_t(batch, row, col)") << ";\n" - << " } else {\n" - " return input_a_value_t(0);\n" - " }\n" - "}\n" - << "var sub_a: array;\n" - << "var inter_results: array, " << WorkgroupSizeY() << ">;\n"; - std::string offset = "workgroup_idx * " + std::to_string(WorkgroupSizeY()); - shader.MainFunctionBody() << " let output_indices = " << y.OffsetToIndices(offset) << ";\n" - << " let col = output_indices[2];\n" - " let row = output_indices[1];\n" - " let batch = output_indices[0];\n"; - } else { - ORT_ENFORCE(tile_m_ < WorkgroupSizeY(), "tile_m must be less than or equal to WorkgroupSizeY."); - ORT_ENFORCE(WorkgroupSizeX() == WorkgroupSizeY(), "WorkgroupSizeX must be equal to WorkgroupSizeY."); - - shader.AdditionalImplementation() << "fn mm_readA(batch : u32, row : u32, col : u32) -> input_a_value_t {\n" - " if (row < uniforms.input_a_shape[1] && col < uniforms.input_a_shape[2]) {\n" - << " return " << a.GetByIndices("input_a_indices_t(batch, row, col)") << ";\n" - << " } else {\n" - " return input_a_value_t(0);\n" - " }\n" - "}\n" - << "var 
sub_a: array," << tile_m_ << ">;\n" - << "var inter_results: array, " << WorkgroupSizeY() << ">," << tile_m_ << ">;\n"; - shader.MainFunctionBody() << " let col = workgroup_id.x * " << WorkgroupSizeY() << ";\n" - << " let row = workgroup_id.y * " << tile_m_ << ";\n" - << " let batch = workgroup_id.z;\n"; - } - shader.MainFunctionBody() << " let n_blocks_per_col = uniforms.input_b_shape[1];\n" - << " let num_tiles = (n_blocks_per_col - 1) / " << blocks_per_tile << " + 1;\n" - // Loop over shared dimension. - << " for (var tile: u32 = 0; tile < num_tiles; tile += 1) {\n" - << " let a_col_start = tile * " << a_length_per_tile << ";\n" - << " // load one tile A data into shared memory.\n" - << " for (var a_offset = local_idx; a_offset < " << a_length_per_tile << "; a_offset += " << workgroup_size << ") {\n" - << " let a_col = a_col_start + a_offset;\n"; - if (tile_m_ == 1) { - shader.MainFunctionBody() << " sub_a[a_offset] = mm_readA(batch, row, a_col);\n"; - } else { - for (uint32_t i = 0; i < tile_m_; i++) { - shader.MainFunctionBody() << " sub_a[" << i << "][a_offset] = mm_readA(batch, row + " << i << ", a_col);\n"; - } - } - shader.MainFunctionBody() << " }\n" - " workgroupBarrier();\n" - // Each thread processes one block. - " let b_row = col + local_id.y;\n" - << " let block = tile * " << blocks_per_tile << " + local_id.x;\n"; - if (has_zero_points_) { - const auto& zero_points = shader.AddInput("zero_points", ShaderUsage::UseUniform); - shader.MainFunctionBody() << " let zero_point_bytes_per_col = (n_blocks_per_col + 1) / 2;\n" - " let zero_point_byte_count = b_row * zero_point_bytes_per_col + (block >> 0x1u);\n" - " let zero_point_word_index = zero_point_byte_count >> 0x2u;\n" - " let zero_point_byte_offset = zero_point_byte_count & 0x3u;\n" - " let zero_point_nibble_offset: u32 = block & 0x1u;\n" - " let zero_point_bits_offset = (zero_point_byte_offset << 3) + (zero_point_nibble_offset << 2);\n" - << " let zero_point_word = " << zero_points.GetByOffset("zero_point_word_index") << " >> zero_point_bits_offset;\n" - << " let zero_point = output_element_t((zero_point_word) & 0xFu);\n"; - } else { - // The default zero point is 8 for unsigned 4-bit quantization. 
- shader.MainFunctionBody() << " let zero_point = output_element_t(8.0);\n"; - } - shader.MainFunctionBody() << " var scale = output_element_t(0);\n" - " var b_data = input_b_value_t(0);\n" - << " if (block < n_blocks_per_col) {\n" - << " scale = " << scales.GetByOffset("b_row * n_blocks_per_col + block") << ";\n" - << " b_data = " << b.GetByIndices("input_b_indices_t(b_row, block, 0)") << ";\n" - << " }\n" - << " var word_offset = local_id.x * " << block_size_ / a.NumComponents() << ";\n" - << " for (var i: u32 = 0; i < " << components_b_ << "; i++) {\n"; - shader.MainFunctionBody() << " let b_value = b_data"; - if (components_b_ > 1) { - shader.MainFunctionBody() << "[i]"; - } - shader.MainFunctionBody() << ";\n" - " let b_value_lower = unpack4xU8(b_value & 0x0F0F0F0Fu);\n" - " let b_value_upper = unpack4xU8((b_value >> 4) & 0x0F0F0F0Fu);\n" - " let b_quantized_values = mat2x4(output_element_t(b_value_lower[0]), output_element_t(b_value_upper[0]), output_element_t(b_value_lower[1]), output_element_t(b_value_upper[1]), output_element_t(b_value_lower[2]), output_element_t(b_value_upper[2]), output_element_t(b_value_lower[3]), output_element_t(b_value_upper[3]));\n" - " let b_dequantized_values = (b_quantized_values - mat2x4("; - for (int i = 0; i < 8; i++) { - shader.MainFunctionBody() << "zero_point"; - if (i < 7) { - shader.MainFunctionBody() << ", "; - } - } - shader.MainFunctionBody() << ")) * scale;\n"; - if (tile_m_ == 1) { - switch (a.NumComponents()) { - case 1: - shader.MainFunctionBody() << " inter_results[local_id.y][local_id.x] += dot(vec4(sub_a[word_offset], sub_a[word_offset + 1], sub_a[word_offset + 2], sub_a[word_offset + 3]), b_dequantized_values[0]) + dot(vec4(sub_a[word_offset + 4], sub_a[word_offset + 5], sub_a[word_offset + 6], sub_a[word_offset + 7]), b_dequantized_values[1]);\n"; - break; - case 2: - shader.MainFunctionBody() << " inter_results[local_id.y][local_id.x] += dot(vec4(sub_a[word_offset], sub_a[word_offset + 1]), b_dequantized_values[0]) + dot(vec4(sub_a[word_offset + 2], sub_a[word_offset + 3]), b_dequantized_values[1]);\n"; - break; - case 4: - shader.MainFunctionBody() << " inter_results[local_id.y][local_id.x] += dot(sub_a[word_offset], b_dequantized_values[0]) + dot(sub_a[word_offset + 1], b_dequantized_values[1]);\n"; - break; - default: - break; - } - } else { - for (uint32_t i = 0; i < tile_m_; i++) { - switch (a.NumComponents()) { - case 1: - shader.MainFunctionBody() << " inter_results[" << i << "][local_id.y][local_id.x] += dot(vec4(sub_a[" << i << "][word_offset], sub_a[" << i << "][word_offset + 1], sub_a[" << i << "][word_offset + 2], sub_a[" << i << "][word_offset + 3]), b_dequantized_values[0]) + dot(vec4(sub_a[" << i << "][word_offset + 4], sub_a[" << i << "][word_offset + 5], sub_a[" << i << "][word_offset + 6], sub_a[" << i << "][word_offset + 7]), b_dequantized_values[1]);\n"; - break; - case 2: - shader.MainFunctionBody() << " inter_results[" << i << "][local_id.y][local_id.x] += dot(vec4(sub_a[" << i << "][word_offset], sub_a[" << i << "][word_offset + 1]), b_dequantized_values[0]) + dot(vec4(sub_a[" << i << "][word_offset + 2], sub_a[" << i << "][word_offset + 3]), b_dequantized_values[1]);\n"; - break; - case 4: - shader.MainFunctionBody() << " inter_results[" << i << "][local_id.y][local_id.x] += dot(sub_a[" << i << "][word_offset], b_dequantized_values[0]) + dot(sub_a[" << i << "][word_offset + 1], b_dequantized_values[1]);\n"; - break; - default: - break; - } - } - } - shader.MainFunctionBody() << " word_offset += " << 8 / 
a.NumComponents() << ";\n" - << " }\n" - " workgroupBarrier();\n" - " }\n"; - if (tile_m_ == 1) { - shader.MainFunctionBody() << " if (local_idx < " << WorkgroupSizeY() << ") {\n" - << " var output_value = output_value_t(0);\n" - << " for (var b = 0u; b < " << WorkgroupSizeX() << "; b++) {\n" - << " output_value += inter_results[local_idx][b];\n" - " }\n" - " if (col + local_idx < uniforms.output_shape[2]) {\n" - << " " << y.SetByIndices("output_indices_t(batch, row, col + local_idx)", "output_value") << ";\n" - << " }\n" - " }\n"; - } else { - shader.MainFunctionBody() << " if (local_id.y < " << tile_m_ << ") {\n" - << " var output_value = output_value_t(0);\n" - << " for (var b = 0u; b < " << WorkgroupSizeX() << "; b++) {\n" - << " output_value += inter_results[local_id.y][local_id.x][b];\n" - " }\n" - " if (row + local_id.y < uniforms.output_shape[1] && col + local_id.x < uniforms.output_shape[2]) {\n" - << " " << y.SetByIndices("output_indices_t(batch, row + local_id.y, col + local_id.x)", "output_value") << ";\n" - << " }\n" - " }\n"; - } - } - } else { - const std::string quantized_data_type = QuantizedDataType(a.NumComponents()); - const int output_element_number = y.NumComponents() * onnxruntime::narrow(output_number_); - - const uint32_t shared_memory_size = output_number_ * WORKGROUP_SIZE; - std::string offset = "workgroup_idx * " + std::to_string(output_number_); - shader.AdditionalImplementation() << "var workgroup_shared : array;\n"; - shader.MainFunctionBody() << " let output_indices = " << y.OffsetToIndices(offset) << ";\n" - << " let col = output_indices[2];\n" - " let row = output_indices[1];\n" - " let batch = output_indices[0];\n" - " let n_blocks_per_col = uniforms.input_b_shape[1];\n" - " let blob_size = uniforms.input_b_shape[2];\n" - " for (var block = local_id.x; block < n_blocks_per_col; block += workgroup_size_x) {\n" - << " var word_offset = block * uniforms.block_size / " << a.NumComponents() << ";\n"; - - // prepare scale and zero point - shader.MainFunctionBody() << " var col_index = col * " << y.NumComponents() << ";\n"; - if (has_zero_points_) { - const auto& zero_points = shader.AddInput("zero_points", ShaderUsage::UseUniform); - shader.MainFunctionBody() << " let zero_point_bytes_per_col = (n_blocks_per_col + 1) / 2;\n" - " var zero_point_byte_count: u32;\n" - " var zero_point_word_index: u32;\n" - " var zero_point_byte_offset: u32;\n" - " let zero_point_nibble_offset: u32 = block & 0x1u;\n" - " var zero_point_bits_offset: u32;\n" - " var zero_point_word: u32;\n"; - for (int c = 0; c < output_element_number; c++) { - shader.MainFunctionBody() << " let scale" << c << " = " << scales.GetByOffset("col_index * n_blocks_per_col + block") << ";\n" - << " zero_point_byte_count = col_index * zero_point_bytes_per_col + (block >> 0x1u);\n" - " zero_point_word_index = zero_point_byte_count >> 0x2u;\n" - " zero_point_byte_offset = zero_point_byte_count & 0x3u;\n" - " zero_point_bits_offset = (zero_point_byte_offset << 3) + (zero_point_nibble_offset << 2);\n" - << " zero_point_word = " << zero_points.GetByOffset("zero_point_word_index") << " >> zero_point_bits_offset;\n" - << " let zero_point" << c << " = output_element_t((zero_point_word) & 0xFu);\n" - << " col_index += 1;\n"; - } - } else { - shader.MainFunctionBody() << " let zero_point = output_element_t(8.0);\n"; - for (int c = 0; c < output_element_number; c++) { - shader.MainFunctionBody() << " let scale" << c << " = " << scales.GetByOffset("col_index * n_blocks_per_col + block") << ";\n" - << " col_index += 
1;\n"; - } - } - - shader.MainFunctionBody() << " for (var word: u32 = 0; word < blob_size; word += 1) {\n"; - - // prepare b data - shader.MainFunctionBody() << " col_index = col * " << y.NumComponents() << ";\n"; - for (int c = 0; c < output_element_number; c++) { - shader.MainFunctionBody() << " let b" << c << "_data = " << b.GetByIndices("input_b_indices_t(col_index, block, word)") << ";\n" - << " col_index += 1;\n"; - } - shader.MainFunctionBody() << " var b_value : u32;\n" - " let b_mask : u32 = 0x0F0F0F0Fu;\n" - " var b_value_lower : vec4;\n" - " var b_value_upper : vec4;\n" - << " var b_quantized_values : " << quantized_data_type << ";\n" - << " var b_dequantized_values : " << quantized_data_type << ";\n"; - - shader.MainFunctionBody() << " for (var i: u32 = 0; i < " << components_b_ << "; i++) {\n"; - - // process one word - shader.MainFunctionBody() << " var input_offset = " << a.IndicesToOffset("input_a_indices_t(batch, row, word_offset)") << ";\n" - << " var a_data: " << quantized_data_type << ";\n" - << " for (var j: u32 = 0; j < " << (8 / a.NumComponents()) << "; j++) {\n" - << " if (word_offset + j < uniforms.input_a_shape[2]) {\n" - << " a_data[j] = " << a.GetByOffset("input_offset") << ";\n" - << " input_offset++;\n" - " } else {\n" - " a_data[j] = input_a_value_t(0);\n" - " }\n" - " }\n"; - for (int c = 0; c < output_element_number; c++) { - shader.MainFunctionBody() << " b_value = b" << c << "_data"; - if (components_b_ > 1) { - shader.MainFunctionBody() << "[i]"; - } - shader.MainFunctionBody() << ";\n" - " b_value_lower = unpack4xU8(b_value & b_mask);\n" - " b_value_upper = unpack4xU8((b_value >> 4) & b_mask);\n" - << " b_quantized_values = " << quantized_data_type << "(output_element_t(b_value_lower[0]), output_element_t(b_value_upper[0]), output_element_t(b_value_lower[1]), output_element_t(b_value_upper[1]), output_element_t(b_value_lower[2]), output_element_t(b_value_upper[2]), output_element_t(b_value_lower[3]), output_element_t(b_value_upper[3]));\n" - << " b_dequantized_values = "; - if (a.NumComponents() == 1) { - if (has_zero_points_) { - shader.MainFunctionBody() << quantized_data_type << "((b_quantized_values[0] - zero_point" << c << ") * scale" << c << ", " - << "(b_quantized_values[1] - zero_point" << c << ") * scale" << c << ", " - << "(b_quantized_values[2] - zero_point" << c << ") * scale" << c << ", " - << "(b_quantized_values[3] - zero_point" << c << ") * scale" << c << ", " - << "(b_quantized_values[4] - zero_point" << c << ") * scale" << c << ", " - << "(b_quantized_values[5] - zero_point" << c << ") * scale" << c << ", " - << "(b_quantized_values[6] - zero_point" << c << ") * scale" << c << ", " - << "(b_quantized_values[7] - zero_point" << c << ") * scale" << c << ");\n"; - } else { - shader.MainFunctionBody() << quantized_data_type << "((b_quantized_values[0] - zero_point) * scale" << c << ", " - << "(b_quantized_values[1] - zero_point) * scale" << c << "," - << "(b_quantized_values[2] - zero_point) * scale" << c << "," - << "(b_quantized_values[3] - zero_point) * scale" << c << "," - << "(b_quantized_values[4] - zero_point) * scale" << c << "," - << "(b_quantized_values[5] - zero_point) * scale" << c << "," - << "(b_quantized_values[6] - zero_point) * scale" << c << "," - << "(b_quantized_values[7] - zero_point) * scale" << c << ");\n"; - } - } else { - shader.MainFunctionBody() << "(b_quantized_values - " << quantized_data_type << "("; - for (int i = 0; i < 8; i++) { - if (has_zero_points_) { - shader.MainFunctionBody() << "zero_point" << c; - 
} else { - shader.MainFunctionBody() << "zero_point"; - } - if (i < 7) { - shader.MainFunctionBody() << ", "; - } - } - shader.MainFunctionBody() << ")) * scale" << c << ";\n"; - } - - shader.MainFunctionBody() << " workgroup_shared[local_id.x * " << output_number_ << " + " << c / y.NumComponents() << "]"; - if (y.NumComponents() > 1) { - shader.MainFunctionBody() << "[" << c % y.NumComponents() << "]"; - } - shader.MainFunctionBody() << " += "; - if (a.NumComponents() == 1) { - shader.MainFunctionBody() << "a_data[0] * b_dequantized_values[0] + " - "a_data[1] * b_dequantized_values[1] + " - "a_data[2] * b_dequantized_values[2] + " - "a_data[3] * b_dequantized_values[3] + " - "a_data[4] * b_dequantized_values[4] + " - "a_data[5] * b_dequantized_values[5] + " - "a_data[6] * b_dequantized_values[6] + " - "a_data[7] * b_dequantized_values[7];\n"; - } else if (a.NumComponents() == 2) { - shader.MainFunctionBody() << "dot(a_data[0], b_dequantized_values[0]) + " - "dot(a_data[1], b_dequantized_values[1]) + " - "dot(a_data[2], b_dequantized_values[2]) + " - "dot(a_data[3], b_dequantized_values[3]);\n"; - } else if (a.NumComponents() == 4) { - shader.MainFunctionBody() << "dot(a_data[0], b_dequantized_values[0]) + " - "dot(a_data[1], b_dequantized_values[1]);\n"; - } - } - - shader.MainFunctionBody() << " word_offset += " << 8 / a.NumComponents() << ";\n" - << " }\n" - " }\n" - " }\n" - " workgroupBarrier();\n" - << " if (local_id.x < " << output_number_ << ") {\n" - << " var output_value = output_value_t(0);\n" - " var workgroup_shared_offset = local_id.x;\n" - << " let blocks_num = min(" << shared_memory_size << ", n_blocks_per_col);\n" - << " for (var b = 0u; b < blocks_num; b++) {\n" - " output_value += workgroup_shared[workgroup_shared_offset];\n" - << " workgroup_shared_offset += " << output_number_ << ";\n" - << " }\n" - << " " << y.SetByIndices("output_indices_t(batch, row, col + local_id.x)", "output_value") << "\n" - << " }\n"; - } - - return Status::OK(); -} - Status MatMulNBitsWideTileProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); @@ -541,65 +90,21 @@ Status MatMulNBitsWideTileProgram::GenerateShaderCode(ShaderHelper& shader) cons // memory read/write helpers shader.AdditionalImplementation() << "fn mm_read_a(batch : u32, row : u32, col : u32) -> input_a_value_t {\n" - << " if (row < uniforms.input_a_shape[1] && col < uniforms.input_a_shape[2]) {\n" + << " if (batch < uniforms.input_a_shape[0] && row < uniforms.input_a_shape[1] && col < uniforms.input_a_shape[2]) {\n" << " return " << a.GetByIndices("input_a_indices_t(batch, row, col)") << ";\n" << " }\n" << " return input_a_value_t(0);\n" << "}\n"; + if (nbits_ == 4) { + shader.AdditionalImplementation() << "\n" + << "fn mm_read_b(row : u32, col : u32) -> input_b_value_t {\n" + << " if (row < uniforms.input_b_shape[0] && col < uniforms.input_b_shape[1]) {\n" + << " return " << b.GetByIndices("input_b_indices_t(row, col, 0)") << ";\n" + << " }\n" + << " return input_b_value_t(0);\n" + << "}\n"; - shader.AdditionalImplementation() << "\n" - << "fn mm_read_b(row : u32, col : u32) -> input_b_value_t {\n" - << " if (row < uniforms.input_b_shape[0] && col < uniforms.input_b_shape[1]) {\n" - << " return " << b.GetByIndices("input_b_indices_t(row, col, 0)") << 
";\n" - << " }\n" - << " return input_b_value_t(0);\n" - << "}\n"; - - shader.AdditionalImplementation() << "\n" - << "fn mm_read_scale(row : u32, col : u32) -> output_element_t {\n" - << " if (row < uniforms.input_b_shape[0] && col < uniforms.input_b_shape[1]) {\n" - << " return scales[row * uniforms.input_b_shape[1] + col];\n" - << " }\n" - << " return output_element_t(0);\n" - << "}\n"; - - if (has_zero_points_) { shader.AdditionalImplementation() << R"( -fn mm_read_zero(row : u32, col : u32) -> output_element_t { - if (row < uniforms.input_b_shape[0] && col < uniforms.input_b_shape[1]) { - let offset = row * uniforms.input_b_stride[0] + col * uniforms.input_b_stride[1]; - - // u32 holds 8 packed uint4. - let array_index = offset / 8u; - let component_index = offset % 8u; - let packed_value = zero_points[array_index]; - - // Extract the uint4 component - let shift_amount = component_index * 4u; - let masked_value = (packed_value >> shift_amount) & 0xFu; - - return output_element_t(masked_value); - } - return output_element_t(0); -} -)"; - } else { - shader.AdditionalImplementation() << R"( -fn mm_read_zero(row : u32, col : u32) -> output_element_t { - // The default zero point is 8. - return output_element_t(8); -} -)"; - } - - shader.AdditionalImplementation() << "\n" - << "fn mm_write_y(batch : u32, row : u32, col : u32, value : output_value_t) {\n" - << " if (row < uniforms.output_shape[1] && col < uniforms.output_shape[2]) {\n" - << " " << y.SetByIndices("output_indices_t(batch, row, col)", "value") << "\n" - << " }\n" - << "}\n"; - - shader.AdditionalImplementation() << R"( fn dequantize_packed8xU4(packed_value : u32, zero_point : output_element_t, scale : output_element_t) -> mat2x4 { let lower_values: vec4 = unpack4xU8(packed_value & 0x0F0F0F0Fu); let upper_values: vec4 = unpack4xU8((packed_value >> 4u) & 0x0F0F0F0Fu); @@ -620,6 +125,23 @@ fn dequantize_packed8xU4(packed_value : u32, zero_point : output_element_t, scal return dequantized_values; } )"; + } + + shader.AdditionalImplementation() << "\n" + << "fn mm_read_scale(row : u32, col : u32) -> output_element_t {\n" + << " if (row < uniforms.input_b_shape[0] && col < uniforms.input_b_shape[1]) {\n" + << " return scales[row * uniforms.input_b_shape[1] + col];\n" + << " }\n" + << " return output_element_t(0);\n" + << "}\n" + << ReadZeroPoint(nbits_, has_zero_points_); + + shader.AdditionalImplementation() << "\n" + << "fn mm_write_y(batch : u32, row : u32, col : u32, value : output_value_t) {\n" + << " if (row < uniforms.output_shape[1] && col < uniforms.output_shape[2]) {\n" + << " " << y.SetByIndices("output_indices_t(batch, row, col)", "value") << "\n" + << " }\n" + << "}\n"; // declare const variables shader.AdditionalImplementation() << "\n" @@ -635,9 +157,9 @@ fn dequantize_packed8xU4(packed_value : u32, zero_point : output_element_t, scal // main shader.MainFunctionBody() << R"MAIN_FN( - let batch = workgroup_id.z; - let row = workgroup_id.y * kTileM; - let col = workgroup_id.x * kTileN; + let batch = workgroup_idx / (uniforms.num_M_tile * uniforms.num_N_tile); + let row = ((workgroup_idx / uniforms.num_N_tile) % uniforms.num_M_tile) * kTileM; + let col = (workgroup_idx % uniforms.num_N_tile) * kTileN; let a_elements_per_col = uniforms.input_a_shape[2]; let a_blocks_per_col = (a_elements_per_col + kAComponentsForBlock32 - 1) / kAComponentsForBlock32; @@ -655,10 +177,13 @@ fn dequantize_packed8xU4(packed_value : u32, zero_point : output_element_t, scal let b_row = col + local_idx; let b_col = a_block_idx; - let b_data = 
mm_read_b(b_row, b_col); let scale = mm_read_scale(b_row, b_col); - let zero_point = mm_read_zero(b_row, b_col); + let zero_point = mm_read_zero(b_row, b_col, uniforms.input_b_shape[0], uniforms.zero_blocks_per_col); +)MAIN_FN"; + if (nbits_ == 4) { + shader.MainFunctionBody() << R"MAIN_FN( + let b_data = mm_read_b(b_row, b_col); // `b` component size is 4. for (var b_idx = 0u; b_idx < 4u; b_idx++) { let b_dequantized = dequantize_packed8xU4(b_data[b_idx], zero_point, scale); @@ -669,10 +194,37 @@ fn dequantize_packed8xU4(packed_value : u32, zero_point : output_element_t, scal results[m_idx] += f32(dot(a_data0, b_dequantized[0])) + f32(dot(a_data1, b_dequantized[1])); } } +)MAIN_FN"; + } else { + shader.MainFunctionBody() << " var b_data0 = vec4(0);\n" + " var b_data1 = vec4(0);\n" + " if (b_row < uniforms.input_b_shape[0] && b_col < uniforms.input_b_shape[1]) {\n" + << " b_data0 = " << b.GetByIndices("input_b_indices_t(b_row, b_col, 0)") << ";\n" + << " b_data1 = " << b.GetByIndices("input_b_indices_t(b_row, b_col, 1)") << ";\n" + " }" + << R"MAIN_FN( + for (var b_idx = 0u; b_idx < 4u; b_idx++) { + let b_dequantized0 = (vec4(unpack4xU8(b_data0[b_idx])) - vec4(zero_point)) * scale; + let b_dequantized1 = (vec4(unpack4xU8(b_data1[b_idx])) - vec4(zero_point)) * scale; + for (var m_idx = 0u; m_idx < kTileM; m_idx++) { + let a_data0 = a_data_tile[m_idx][b_idx]; + let a_data1 = a_data_tile[m_idx][b_idx + 4u]; + + results[m_idx] += f32(dot(a_data0, b_dequantized0)) + f32(dot(a_data1, b_dequantized1)); + } + } +)MAIN_FN"; + } + + shader.MainFunctionBody() << R"MAIN_FN( workgroupBarrier(); } + if (batch >= uniforms.input_a_shape[0]) { + return; + } + // Write the results. for (var m_idx = 0u; m_idx < kTileM; m_idx++) { mm_write_y(batch, row + m_idx, col + local_idx, output_value_t(results[m_idx])); @@ -682,6 +234,156 @@ fn dequantize_packed8xU4(packed_value : u32, zero_point : output_element_t, scal return Status::OK(); } +// Apply similar idea with DP4AMatMulNBitsSmallMProgram algorithm. +Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto& a = shader.AddInput("input_a", ShaderUsage::UseValueTypeAlias); + const auto& b = shader.AddInput("input_b"); + shader.AddInput("scales_b"); + if (has_zero_points_) { + shader.AddInput("zero_points", ShaderUsage::UseUniform); + } + shader.AddOutput("output", ShaderUsage::UseElementTypeAlias); + const uint32_t components_a = a.NumComponents(); + const uint32_t components_b = b.NumComponents() / 4; // b is stored as uint32 which includs 4 uint8. 
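
For readers following the shader generation, a CPU-side sketch of the packing convention may help. It assumes only the bit layout used above: a `u32` of input B holds `32 / nbits` quantized values, element `i` sits at bits `[i * nbits, (i + 1) * nbits)`, and the default zero point is 8 for 4-bit data and 128 for 8-bit data. The helper names are illustrative reference code, not part of the kernel.

```cpp
#include <cstdint>

// Extract and dequantize element `index_in_word` from one packed u32 of B.
inline float DequantizeElement(uint32_t packed, uint32_t index_in_word,
                               uint32_t nbits, float scale, float zero_point) {
  const uint32_t mask = (nbits == 4) ? 0xFu : 0xFFu;
  const uint32_t shift = index_in_word * nbits;
  const uint32_t q = (packed >> shift) & mask;
  return (static_cast<float>(q) - zero_point) * scale;
}

// Reference dot product of one packed word of B against the matching span of A.
inline float DotOneWord(const float* a, uint32_t packed_b, uint32_t nbits,
                        float scale, float zero_point) {
  const uint32_t elements_per_word = 32 / nbits;  // 8 for 4-bit, 4 for 8-bit
  float sum = 0.0f;
  for (uint32_t i = 0; i < elements_per_word; ++i) {
    sum += a[i] * DequantizeElement(packed_b, i, nbits, scale, zero_point);
  }
  return sum;
}
```
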
+ constexpr uint32_t tile_size_k_vec = 16; + uint32_t elements_in_value_b = components_b * (32 / nbits_); + uint32_t tile_k_size = tile_size_k_vec * elements_in_value_b; + const uint32_t a_length_per_tile = tile_k_size / components_a; + + shader.AdditionalImplementation() << "const a_length_per_tile = " << a_length_per_tile << "u;\n" + << "const tile_size_k_vec = " << tile_size_k_vec << ";\n" + << "const tile_size_k = " << tile_k_size << "u;\n" + << "const tile_size = " << tile_size_ << "u;\n" + << "const elements_in_value_b = " << elements_in_value_b << "u;\n" + << "const sub_tile_count = " << WorkgroupSizeX() / tile_size_k_vec << "u;\n" + << "const component_a = " << components_a << "u;\n" + << "const component_b = " << components_b << "u;\n"; + shader.AdditionalImplementation() << R"ADDNL_FN( + // Shared memory + var tile_A : array; + var inter_results: array, tile_size>; + fn loadSHMA(batch: u32, a_global: u32, kidx: u32, col: u32) + { + let k_offset = kidx / component_a + col; + if (batch < uniforms.batch_count && k_offset < uniforms.K_of_a) { + tile_A[col] = input_a[batch * uniforms.M * uniforms.K_of_a + a_global * uniforms.K_of_a + k_offset]; + } else { + tile_A[col] = input_a_value_t(0); + } + } +)ADDNL_FN" + << ReadZeroPoint(nbits_, has_zero_points_); + + shader.MainFunctionBody() << R"MAIN_FN( + let batch = workgroup_idx / (uniforms.M * uniforms.num_N_tile); + let a_global = (workgroup_idx / uniforms.num_N_tile) % uniforms.M; + let b_global_base = (workgroup_idx % uniforms.num_N_tile) * tile_size; + + let idx = local_idx % tile_size_k_vec; + let idy = local_idx / tile_size_k_vec; + + for (var kidx = 0u; kidx < uniforms.K; kidx += tile_size_k) + { + for (var id = local_idx; id < a_length_per_tile; id += workgroup_size_x) + { + loadSHMA(batch, a_global, kidx, id); + } + workgroupBarrier(); + + for (var local_row_offset = 0u; local_row_offset < tile_size; local_row_offset += sub_tile_count) + { + var b_global = b_global_base + local_row_offset + idy; + var k_offset = kidx / elements_in_value_b + idx; + if (b_global < uniforms.N && k_offset < uniforms.K_of_b) + { + let block_idx = (kidx + idx * elements_in_value_b) / uniforms.block_size; + let scale_b = scales_b[b_global * uniforms.blocks_per_col + block_idx]; + let zero = mm_read_zero(b_global, block_idx, uniforms.N, uniforms.zero_blocks_per_col); + var b_value = input_b[b_global * uniforms.K_of_b + k_offset]; +)MAIN_FN"; + + if (nbits_ == 4) { + shader.MainFunctionBody() << R"MAIN_FN( + var sum = output_element_t(0); + var a_offset = idx * (8 / component_a) * component_b; + for (var i = 0u; i < component_b; i++) { + let b_value_lower = vec4(unpack4xU8(b_value[i] & 0x0F0F0F0Fu)) - vec4(zero); + let b_value_upper = vec4(unpack4xU8((b_value[i] >> 4) & 0x0F0F0F0Fu)) - vec4(zero); + let b0 = vec4(b_value_lower[0], b_value_upper[0], b_value_lower[1], b_value_upper[1]) * scale_b; + let b1 = vec4(b_value_lower[2], b_value_upper[2], b_value_lower[3], b_value_upper[3]) * scale_b; +)MAIN_FN"; + switch (components_a) { + case 1: + shader.MainFunctionBody() << " sum += dot(vec4(tile_A[a_offset], tile_A[a_offset + 1], tile_A[a_offset + 2], tile_A[a_offset + 3]), b0) +" + " dot(vec4(tile_A[a_offset + 4], tile_A[a_offset + 5], tile_A[a_offset + 6], tile_A[a_offset + 7]), b1);\n" + " a_offset += 8;\n"; + break; + case 2: + shader.MainFunctionBody() << " sum += dot(vec4(tile_A[a_offset], tile_A[a_offset + 1]), b0) +" + "dot(vec4(tile_A[a_offset + 2], tile_A[a_offset + 3]), b1);\n" + " a_offset += 4;\n"; + break; + case 4: + shader.MainFunctionBody() 
<< " sum += dot(tile_A[a_offset], b0) + dot(tile_A[a_offset + 1], b1);\n" + " a_offset += 2;\n"; + break; + default: + break; + } + shader.MainFunctionBody() << " }\n"; + } else { + shader.MainFunctionBody() << R"MAIN_FN( + var sum = output_element_t(0); + var a_offset = idx * (4 / component_a) * component_b; + for (var i = 0u; i < component_b; i++) { + let b_value = (vec4(unpack4xU8(b_value[i])) - vec4(zero)) * scale_b; +)MAIN_FN"; + switch (components_a) { + case 1: + shader.MainFunctionBody() << " sum += dot(vec4(tile_A[a_offset], tile_A[a_offset + 1], tile_A[a_offset + 2], tile_A[a_offset + 3]), b_value);\n" + " a_offset += 4;\n"; + break; + case 2: + shader.MainFunctionBody() << " sum += dot(vec4(tile_A[a_offset], tile_A[a_offset + 1]), b_value);\n" + " a_offset += 2;\n"; + break; + case 4: + shader.MainFunctionBody() << " sum += dot(tile_A[a_offset], b_value);\n" + " a_offset += 1;\n"; + break; + default: + break; + } + shader.MainFunctionBody() << " }\n"; + } + + shader.MainFunctionBody() << R"MAIN_FN( + inter_results[local_row_offset + idy][idx] += sum; + } + } + workgroupBarrier(); + } + + if (batch >= uniforms.batch_count) { + return; + } + + if (local_idx < tile_size) { + var output_value = output_element_t(0); + for (var b = 0u; b < tile_size_k_vec; b++) { + output_value += inter_results[local_idx][b]; + } + let b_global = b_global_base + local_idx; + let output_idx = batch * uniforms.M * uniforms.N + a_global * uniforms.N + b_global; + if (b_global < uniforms.N) { + output[output_idx] = output_value; + } + } +)MAIN_FN"; + + return Status::OK(); +} + Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { const Tensor* a = context.Input(0); const Tensor* b = context.Input(1); @@ -724,20 +426,18 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context } // On FP32 only GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M. - if ((M >= kMinMForTileOptimization || y->DataType() == DataTypeImpl::GetType() || nbits == 8 || + if ((M >= kMinMForTileOptimization || y->DataType() == DataTypeImpl::GetType() || context.AdapterInfo().vendor == std::string_view{"qualcomm"}) && CanApplyDP4AMatrixMatMulNBits(context, accuracy_level_, block_size, batch_count, N, K, components_a, has_zero_points)) { return ApplyDP4AMatrixMatMulNBits(a, b, scales, M, N, K, block_size, kMinMForTileOptimization, nbits, context, y); } - // TODO: Remvoe it once the 8bits is supported for the non-dp4 path. - ORT_ENFORCE(nbits == 4, "Only 4 bits are supported for the non-dp4 path for webgpu matmulnbits"); + // zero_points has shape[N * CeilDiv(n_blocks_per_col * bits, 8)]. So here we need to check whether n_blocks_per_col is divisible by 8/nbits. + uint32_t zero_blocks_per_col = n_blocks_per_col % (8 / nbits) == 0 ? n_blocks_per_col : n_blocks_per_col + 1; // WideTileProgram // This program is optimized for Block32 prefill using Tile16x128. - // TODO: loosen restrictions on vendor. - const bool use_wide_tile_program = block_size == 32 && components_a == 4 && components_b == 4 && M >= kMinMForTileOptimization && - context.AdapterInfo().vendor == std::string_view{"intel"}; + const bool use_wide_tile_program = block_size == 32 && components_a == 4 && components_b == 4 && M >= kMinMForTileOptimization; if (use_wide_tile_program) { // Enforce output components to 1. 
components = 1; @@ -745,8 +445,10 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context constexpr uint32_t workgroup_size = 128; constexpr uint32_t tile_m = workgroup_size / 8; constexpr uint32_t tile_n = workgroup_size; + uint32_t num_N_tile = (N + tile_n - 1) / tile_n; + uint32_t num_M_tile = (M + tile_m - 1) / tile_m; - MatMulNBitsWideTileProgram program{has_zero_points, tile_m, tile_n}; + MatMulNBitsWideTileProgram program{has_zero_points, tile_m, tile_n, nbits}; program.SetWorkgroupSize(workgroup_size); program.SetDispatchGroupSize((N + tile_n - 1) / tile_n, (M + tile_m - 1) / tile_m, @@ -762,7 +464,8 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, onnxruntime::narrow(components_b * 4)}, {scales, ProgramTensorMetadataDependency::None}}) .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, onnxruntime::narrow(components)}) - .AddUniformVariable({block_size}); + .AddUniformVariables({{block_size}, {zero_blocks_per_col}, {num_N_tile}, {num_M_tile}}) + .CacheHint(nbits, has_zero_points); if (has_zero_points) { program.AddInput({zero_points, ProgramTensorMetadataDependency::None, {(zero_points->Shape().Size() + 3) / 4}, 4}); } @@ -770,47 +473,21 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context return context.RunProgram(program); } - // Generic program - // TODO: Support output_number > 1. Some cases are failed when output_number > 1. - constexpr uint32_t output_number = 1; - const uint32_t tile_m = M > kMinMForTileOptimization ? 4 : 1; - const bool has_subgroup = context.HasFeature(wgpu::FeatureName::Subgroups); - const bool use_subgroup = has_subgroup && context.AdapterInfo().vendor == std::string_view{"intel"} && components_a == 4 && block_size == 32; - MatMulNBitsProgram program{output_number, block_size, tile_m, static_cast(components_b), has_zero_points, use_subgroup}; - if (M > kMinMForTileOptimization && block_size == 32) { - components = 1; - constexpr uint32_t workgroup_size = 64; - constexpr uint32_t workgroup_y = 8; - constexpr uint32_t workgroup_x = workgroup_size / workgroup_y; - program.SetWorkgroupSize(workgroup_x, workgroup_y, 1); - program.SetDispatchGroupSize((N + workgroup_y - 1) / workgroup_y, - (M + tile_m - 1) / tile_m, - batch_count); - program.CacheHint("T_M" + std::to_string(tile_m) + "Subgroup" + std::to_string(use_subgroup)); - } else if (block_size == 32) { - components = 1; - // TODO: Tune the workgroup size when `M=1`. - constexpr uint32_t workgroup_size = 128; - const uint32_t workgroup_y = N % 8 == 0 ? 
8 : 1; - const uint32_t workgroup_x = workgroup_size / workgroup_y; - program.SetWorkgroupSize(workgroup_x, workgroup_y, 1); - program.SetDispatchGroupSize(data_size / components / workgroup_y); - program.CacheHint("T_M" + std::to_string(tile_m)); - } else { - program.SetDispatchGroupSize(data_size / components / output_number); - program.CacheHint("O_N" + std::to_string(output_number)); - } - - TensorShape reshaped_a_shape{batch_count, M, K / components_a}; - TensorShape reshaped_b_shape{N, n_blocks_per_col, blob_size_in_words / components_b}; - TensorShape reshaped_y_shape{batch_count, M, N / components}; - + constexpr uint32_t workgroup_size = 128; + constexpr uint32_t tile_size = 8; + constexpr uint32_t kU32Components = 4; + uint32_t components_b_with_u32 = components_b * kU32Components; + uint32_t num_N_tile = (N + tile_size - 1) / tile_size; + MatMulNBitsProgram program{tile_size, nbits, has_zero_points}; + program.SetWorkgroupSize(workgroup_size); + program.SetDispatchGroupSize((N + tile_size - 1) / tile_size, M, batch_count); program - .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, reshaped_a_shape, static_cast(components_a)}, - {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, static_cast(components_b * 4 /** b will be accessed as uint32 which includs 4 uint8. So here we need to multiply 4.*/)}, - {scales, ProgramTensorMetadataDependency::None}}) - .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast(components)}) - .AddUniformVariable({block_size}); + .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components_a)}, + {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast(components_b_with_u32)}, + {scales, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank}) + .AddUniformVariables({{M}, {N}, {K}, {K / components_a}, {n_blocks_per_col * blob_size / components_b_with_u32}, {block_size}, {n_blocks_per_col}, {zero_blocks_per_col}, {num_N_tile}, {batch_count}}) + .CacheHint(nbits, has_zero_points); if (has_zero_points) { program.AddInput({zero_points, ProgramTensorMetadataDependency::None, {(zero_points->Shape().Size() + 3) / 4}, 4}); } diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h index d5e4bc68fc33a..807576c91752b 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h @@ -12,41 +12,44 @@ namespace webgpu { using namespace onnxruntime::webgpu; -class MatMulNBitsProgram final : public Program { +class MatMulNBitsWideTileProgram final : public Program { public: - MatMulNBitsProgram(uint32_t output_number, uint32_t block_size, uint32_t tile_m, int components_b, bool has_zero_points, bool use_subgroup) : Program{"MatMulNBits"}, - output_number_{output_number}, - block_size_{block_size}, - tile_m_{tile_m}, - components_b_{components_b}, - has_zero_points_{has_zero_points}, - use_subgroup_(use_subgroup) { - } + MatMulNBitsWideTileProgram(bool has_zero_points, uint32_t tile_m, uint32_t tile_n, uint32_t nbits) + : Program{"MatMulNBitsWideTileProgram"}, has_zero_points_{has_zero_points}, tile_m_(tile_m), tile_n_(tile_n), nbits_(nbits) {} Status GenerateShaderCode(ShaderHelper& sh) const override; - WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"block_size", ProgramUniformVariableDataType::Uint32}); + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"block_size", 
ProgramUniformVariableDataType::Uint32}, + {"zero_blocks_per_col", ProgramUniformVariableDataType::Uint32}, + {"num_N_tile", ProgramUniformVariableDataType::Uint32}, + {"num_M_tile", ProgramUniformVariableDataType::Uint32}); private: - uint32_t output_number_; - uint32_t block_size_; - uint32_t tile_m_; - int components_b_; bool has_zero_points_; - bool use_subgroup_; + uint32_t tile_m_; + uint32_t tile_n_; + uint32_t nbits_; }; -class MatMulNBitsWideTileProgram final : public Program { +class MatMulNBitsProgram final : public Program { public: - MatMulNBitsWideTileProgram(bool has_zero_points, uint32_t tile_m, uint32_t tile_n) - : Program{"MatMulNBitsWideTileProgram"}, has_zero_points_{has_zero_points}, tile_m_(tile_m), tile_n_(tile_n) {} - + MatMulNBitsProgram(uint32_t tile_size, uint32_t nbits, bool has_zero_points) : Program{"MatMulNBits"}, tile_size_(tile_size), nbits_(nbits), has_zero_points_(has_zero_points) {} Status GenerateShaderCode(ShaderHelper& sh) const override; - WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"block_size", ProgramUniformVariableDataType::Uint32}); + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( + {"M", ProgramUniformVariableDataType::Uint32}, + {"N", ProgramUniformVariableDataType::Uint32}, + {"K", ProgramUniformVariableDataType::Uint32}, + {"K_of_a", ProgramUniformVariableDataType::Uint32}, + {"K_of_b", ProgramUniformVariableDataType::Uint32}, + {"block_size", ProgramUniformVariableDataType::Uint32}, + {"blocks_per_col", ProgramUniformVariableDataType::Uint32}, + {"zero_blocks_per_col", ProgramUniformVariableDataType::Uint32}, + {"num_N_tile", ProgramUniformVariableDataType::Uint32}, + {"batch_count", ProgramUniformVariableDataType::Uint32}); private: + uint32_t tile_size_; + uint32_t nbits_; bool has_zero_points_; - uint32_t tile_m_; - uint32_t tile_n_; }; class MatMulNBits final : public WebGpuKernel { diff --git a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc index b29fc5181eb46..257d3b3efdf9c 100644 --- a/onnxruntime/test/contrib_ops/matmul_8bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_8bits_test.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
#ifndef ORT_MINIMAL_BUILD -#if (defined(MLAS_TARGET_AMD64_IX86) && !defined(USE_DML) && !defined(USE_WEBGPU) && !defined(USE_COREML)) || defined(USE_CUDA) +#if (defined(MLAS_TARGET_AMD64_IX86) && !defined(USE_DML) && !defined(USE_WEBGPU) && !defined(USE_COREML)) || defined(USE_CUDA) || defined(USE_WEBGPU) #include @@ -186,6 +186,10 @@ void RunTest8Bits(const TestOptions8Bits& opts) { std::vector> execution_providers; #ifdef USE_CUDA execution_providers.emplace_back(DefaultCudaExecutionProvider()); +#elif USE_WEBGPU + execution_providers.emplace_back(DefaultWebGpuExecutionProvider()); +#endif +#if defined(USE_CUDA) || defined(USE_WEBGPU) test.ConfigEps(std::move(execution_providers)); test.RunWithConfig(); execution_providers.clear(); @@ -226,8 +230,8 @@ void TestMatMul8BitsTyped() { RunTest8Bits(opts); } -// CUDA does not support bias for MatMulNBits -#if not defined(USE_CUDA) +// CUDA/WEBGPU does not support bias for MatMulNBits +#if !defined(USE_CUDA) && !defined(USE_WEBGPU) { TestOptions8Bits opts = base_opts; opts.has_bias = true; @@ -279,7 +283,7 @@ TEST(MatMulNBits, Float32_8b_AccuracyLevel4_Float) { TestMatMul8BitsTyped(); } -#ifdef USE_CUDA +#if defined(USE_CUDA) || defined(USE_WEBGPU) TEST(MatMulNBits, Float32_8b_AccuracyLevel4_Float16) { TestMatMul8BitsTyped(); TestMatMul8BitsTyped(); From 3a618f21a6a9efc327be0a2a0c77459978190ac0 Mon Sep 17 00:00:00 2001 From: Kee Date: Wed, 7 May 2025 00:58:51 +0800 Subject: [PATCH 16/84] [VSINPU EP]Fix gather OP with scalar indice issue (#24603) ### Description If indices is a scalar(0 dimensional tensor) , gather OP produces incorrect output shape. Fix the gather op bug in VSINPU EP. ### Motivation and Context Signed-off-by: Kee --- .../vsinpu/builders/impl/gather_op_builder.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/vsinpu/builders/impl/gather_op_builder.h b/onnxruntime/core/providers/vsinpu/builders/impl/gather_op_builder.h index bd91bbd81e1fa..0d3374bce325b 100644 --- a/onnxruntime/core/providers/vsinpu/builders/impl/gather_op_builder.h +++ b/onnxruntime/core/providers/vsinpu/builders/impl/gather_op_builder.h @@ -59,21 +59,33 @@ class GatherOpBuilder : public BaseOpBuilder { std::vector>& outputs, const NodeUnit& node_unit) override { LOGS_DEFAULT(VERBOSE) << "Creating Gather Op."; + auto indices = node_unit.Inputs()[1]; + int8_t is_scalar_indices = 0; NodeAttrHelper helper(node_unit.GetNode()); auto axis = helper.Get("axis", 0); axis = util::ReverseAxis(axis, inputs[0]->GetShape().size()); auto op = graph_ep->GetGraph()->CreateOperation(axis, 0); + auto indices_shape_proto = indices.node_arg.Shape(); + if (indices_shape_proto != nullptr) { + if (indices_shape_proto->dim_size() == 0) { + is_scalar_indices = 1; + } + } else { + is_scalar_indices = 1; + } bool is_i64_indices = inputs[1]->GetDataType() == tim::vx::DataType::INT64; if (!is_i64_indices) { + inputs[1]->SetScalar(is_scalar_indices); (*op).BindInputs(inputs).BindOutputs(outputs); } else { std::vector origin_data(inputs[1]->GetSpec().GetElementNum()); inputs[1]->CopyDataFromTensor(origin_data.data()); std::vector transformed_data(origin_data.begin(), origin_data.end()); - tim::vx::TensorSpec ts = inputs[1]->GetSpec().SetAttribute(tim::vx::TensorAttribute::INPUT); + tim::vx::TensorSpec ts = inputs[1]->GetSpec(); ts.SetDataType(tim::vx::DataType::INT32); auto transformed_indices = graph_ep->GetGraph()->CreateTensor(ts, transformed_data.data()); + transformed_indices->SetScalar(is_scalar_indices); 
(*op).BindInput(inputs[0]).BindInput(transformed_indices).BindOutput(outputs[0]); } graph_ep->GetOps().push_back(std::move(op)); From 65b4c3775040aaf4b7c9fe2e8e317c55cac4d12c Mon Sep 17 00:00:00 2001 From: Satya Kumar Jandhyala Date: Tue, 6 May 2025 10:49:54 -0700 Subject: [PATCH 17/84] [Native WebGPU] Fixed type mismatch. (#24655) ### Description Fix type mismatch using float in place of unsigned int. ### Motivation and Context --- .../core/providers/webgpu/quantization/quantize_linear.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/quantization/quantize_linear.cc b/onnxruntime/core/providers/webgpu/quantization/quantize_linear.cc index e2b5d73168935..0305049e9b789 100644 --- a/onnxruntime/core/providers/webgpu/quantization/quantize_linear.cc +++ b/onnxruntime/core/providers/webgpu/quantization/quantize_linear.cc @@ -79,7 +79,7 @@ Status DequantizeLinearProgram::GenerateShaderCode(ShaderHelper& shader) const { if (packed_) { shader.MainFunctionBody() << "let zero_point_index = " << output.IndicesGet("output_indices", "uniforms.axis") << ";\n" - << "let zero_point_input = " << zero_point.GetByOffset("zero_point_index / 4") << ";\n" + << "let zero_point_input = " << zero_point.GetByOffset("u32(zero_point_index / 4)") << ";\n" << "let zero_point_vec = " << unpack << ";\n" << "let zero_point_value = zero_point_vec[zero_point_index % 4];\n"; } else { @@ -92,7 +92,7 @@ Status DequantizeLinearProgram::GenerateShaderCode(ShaderHelper& shader) const { if (packed_) { shader.MainFunctionBody() << "let zero_point_offset = " << scale.GetByIndices("scale_indices") << ";\n" - << "let zero_point_input = " << zero_point.GetByOffset("zero_point_offset / 4") << ";\n" + << "let zero_point_input = " << zero_point.GetByOffset("u32(zero_point_offset / 4)") << ";\n" << "let zero_point_vec = " << unpack << ";\n" << "let zero_point_value = zero_point_vec[zero_point_offset % 4];\n"; } else { From 7942b0caa991ef03fc6d266a72971ad4d3354da3 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Tue, 6 May 2025 11:29:21 -0700 Subject: [PATCH 18/84] [webgpu] fix compile errors in instancenorm (#24639) fix shader compile; don't know how this made it past ci --- onnxruntime/core/providers/webgpu/nn/instance_norm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/nn/instance_norm.cc b/onnxruntime/core/providers/webgpu/nn/instance_norm.cc index f3bccec4872fc..7b39980f85605 100644 --- a/onnxruntime/core/providers/webgpu/nn/instance_norm.cc +++ b/onnxruntime/core/providers/webgpu/nn/instance_norm.cc @@ -88,13 +88,13 @@ Status InstanceNormProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& channel_scale_shift = shader.AddInput("channel_scale_shift", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") - << "let outputIndices = " << output.OffsetToIndices("global_idx") + << "let outputIndices = " << output.OffsetToIndices("global_idx") << ";\n" << "let batch = outputIndices[0];\n" << "let channel = outputIndices[1];\n" << "let channel_scale_shift_indices = channel_scale_shift_indices_t(batch, channel, 0);\n" << "let channel_scale_shift = " << channel_scale_shift.GetByIndices("channel_scale_shift_indices") << ";\n" << "let input_value = " << 
input.GetByOffset("global_idx") << ";\n" - << "let output_value = input_value * output_value_t(channel_scale_sift.x) + output_value_t(channel_scale_shift.y);\n" + << "let output_value = input_value * output_value_t(channel_scale_shift.x) + output_value_t(channel_scale_shift.y);\n" << output.SetByOffset("global_idx", "output_value") << ";\n"; return Status::OK(); } From cab3c421ead7631a080c692f2de850cc76fdc1c9 Mon Sep 17 00:00:00 2001 From: vraspar Date: Tue, 6 May 2025 11:30:36 -0700 Subject: [PATCH 19/84] Fix source name in CUDA publishing pipeline configuration (#24645) ### Description Python Cuda Publishing pipeline references old test pipeline --- .../github/azure-pipelines/py-cuda-publishing-pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml index 230c391c00ebd..016c09e6c01da 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml @@ -1,7 +1,7 @@ resources: pipelines: - pipeline: build - source: 'Python CUDA12 Package Test Pipeline' + source: 'Python CUDA Package Test Pipeline' trigger: branches: include: @@ -37,4 +37,4 @@ extends: stages: - template: stages/py-cuda-publishing-stage.yml parameters: - artifact_feed: $(ArtifactFeed) \ No newline at end of file + artifact_feed: $(ArtifactFeed) From 1f4ca889744857d151e60e5867f683876caa5355 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 6 May 2025 12:08:24 -0700 Subject: [PATCH 20/84] allow upload log on failure for further investigating (#24649) ### Description The random failure on Web CI is hard to investigate because it's not reproducible. Add this step to upload the log to help investigate the issue. --- .github/workflows/windows-web-ci-workflow.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/windows-web-ci-workflow.yml b/.github/workflows/windows-web-ci-workflow.yml index 57f687d8502ff..b45663a6145e3 100644 --- a/.github/workflows/windows-web-ci-workflow.yml +++ b/.github/workflows/windows-web-ci-workflow.yml @@ -205,6 +205,14 @@ jobs: log_file_path: ${{ runner.temp }}\web\test\07\chrome_debug.log is_chromium_log: true + # this step is added to help investigate the shader validation failure which is hard to reproduce + - name: Upload WebGPU shader validation log on failure + if: ${{ failure() && inputs.run_webgpu_tests == true && inputs.build_config == 'Debug' }} + uses: actions/upload-artifact@v4 + with: + name: webgpu-shader-validation-logs + path: ${{ runner.temp }}\web\test\07\chrome_debug.log + - name: E2E package consuming test if: ${{ inputs.build_config == 'Release' }} run: npm run test:e2e -- --browser=Chrome_default From 8aa0b2851729ea99a7664c88bb5b476dfd46b669 Mon Sep 17 00:00:00 2001 From: Satya Kumar Jandhyala Date: Tue, 6 May 2025 12:59:27 -0700 Subject: [PATCH 21/84] [JSEP] Fix outputSize calculation causing duplicate indices. (#24650) ### Description Fix the outputSize computation causing duplicate indices. The outputSize should be the size of indices tensor without counting the last dimension. 
### Motivation and Context Fix the issue https://github.com/microsoft/onnxruntime/issues/24070 --- js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts | 100 ++++++------------ 1 file changed, 33 insertions(+), 67 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts b/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts index ec1d23e4887d5..286984c15feca 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/scatter-nd.ts @@ -78,46 +78,15 @@ const atomicReductionSnippet = (reduction: string, ptr: string, v: string, type: } }; -const calcDataOffsetSnippet = (dataRank: number, parallel: boolean) => - `${ - dataRank === 1 - ? ` - let element_count_dim = uniforms.output_strides; - let dim_value = uniforms.output_shape;` - : ` - let element_count_dim = uniforms.output_strides[${parallel ? 'i - indices_start' : 'i'}]; - let dim_value = uniforms.output_shape[${parallel ? 'i - indices_start' : 'i'} + uniforms.last_index_dimension];` - } - - if (index >= 0) { - if (index >= i32(dim_value)) { - index = i32(dim_value - 1); - } - } else { - if (index < -i32(dim_value)) { - index = 0; - } else { - index += i32(dim_value); - } - } - data_offset += u32((u32(index) * element_count_dim));`; - -const updateElementsSnippet = (attributes: ScatterNDAttributes, outputTypeValue: ReductionType, parallel: boolean) => - `for (var i = 0u; i < uniforms.num_updates_elements; i++) { - let value = updates[uniforms.num_updates_elements * ${parallel ? 'global_idx' : 'idx'} + i]; - ${atomicReductionSnippet(attributes.reduction, 'output[data_offset + i]', 'value', outputTypeValue)} - }`; - const createScatterNDProgramInfo = (inputs: readonly TensorView[], attributes: ScatterNDAttributes): ProgramInfo => { const inputShape = inputs[0].dims; const indicesShape = inputs[1].dims; const outputShape = inputShape; // TODO: support bool with components 4. const components = 1; - const outputSize = Math.ceil(ShapeUtil.size(indicesShape) / components); + const outputSize = Math.ceil(ShapeUtil.sizeToDimension(indicesShape, indicesShape.length - 1) / components); const lastIndexDimension = indicesShape[indicesShape.length - 1]; const numUpdatesElements = ShapeUtil.sizeFromDimension(inputShape, lastIndexDimension); - const numIndicesElements = ShapeUtil.sizeFromDimension(indicesShape, 0) / lastIndexDimension; const programUniforms: ProgramUniform[] = [ { type: DataType.uint32, data: outputSize }, @@ -142,48 +111,45 @@ const createScatterNDProgramInfo = (inputs: readonly TensorView[], attributes: S .declareVariables(indices, updates, output)} ${shaderHelper.mainStart()} ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} - var hasDuplicates = false; - if (${attributes.reduction === 'none'}) { - for (var i = 0; i < ${numIndicesElements}; i = i + 1) { - for (var j = i + 1; j < ${numIndicesElements}; j = j + 1) { - var index_i = i32(indices[i].x); - var index_j = i32(indices[j].x); - if (index_i == index_j) { - hasDuplicates = true; - break; - } + var data_offset = 0u; + let indices_start = uniforms.last_index_dimension * global_idx; + let indices_end = indices_start + uniforms.last_index_dimension; + for (var i = indices_start; i < indices_end; i++) { + var index = i32(indices[i].x); + ${ + inputs[0].dims.length === 1 + ? 
` + let element_count_dim = uniforms.output_strides; + let dim_value = uniforms.output_shape;` + : ` + let element_count_dim = uniforms.output_strides[i - indices_start]; + let dim_value = uniforms.output_shape[i - indices_start];` + } + if (index >= 0) { + if (index >= i32(dim_value)) { + index = i32(dim_value - 1); } - if (hasDuplicates) { - break; + } else { + if (index < -i32(dim_value)) { + index = 0; + } else { + index += i32(dim_value); } } + data_offset += u32((u32(index) * element_count_dim)); } - if (${attributes.reduction === 'none'} && hasDuplicates) { - if (global_idx != 0u) { - return; - } - // Process each index-update pair individually when duplicates exist - for (var idx = 0u; idx < ${numIndicesElements}u; idx++) { - var data_offset = 0u; - for (var i = 0u; i < uniforms.last_index_dimension; i++) { - var index = i32(indices[idx * uniforms.last_index_dimension + i].x); - ${calcDataOffsetSnippet(inputShape.length, false)} - } - ${updateElementsSnippet(attributes, output.type.value as ReductionType, false)} - } - return; + for (var i = 0u; i < uniforms.num_updates_elements; i++) { + let value = updates[uniforms.num_updates_elements * global_idx + i]; + ${atomicReductionSnippet( + attributes.reduction, + 'output[data_offset + i]', + 'value', + output.type.value as ReductionType, + )} } - var data_offset = 0u; - var indices_start = uniforms.last_index_dimension * global_idx; - var indices_end = indices_start + uniforms.last_index_dimension; - for (var i = indices_start; i < indices_end; i++) { - var index = i32(indices[i].x); - ${calcDataOffsetSnippet(inputShape.length, true)} - } - ${updateElementsSnippet(attributes, output.type.value as ReductionType, true)} - }`; + }`; }; return { name: 'ScatterND', From 7bec521e9bbf5ad88a5c8c7eb7898360923f7023 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 6 May 2025 13:08:57 -0700 Subject: [PATCH 22/84] fix header include in webgpu_context.cc (#24648) ### Description header file "dawn/dawn_proc.h" is only used in a non-monolithic build of dawn. 
--- onnxruntime/core/providers/webgpu/webgpu_context.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 17f4fa1bd44b3..27380645baf57 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -10,7 +10,9 @@ #endif #if !defined(__wasm__) +#if !defined(BUILD_DAWN_MONOLITHIC_LIBRARY) #include "dawn/dawn_proc.h" +#endif #if !defined(USE_EXTERNAL_DAWN) #include "dawn/native/DawnNative.h" #endif From cdff2c1179e3d29c25ad8ae790a089ee65945ed0 Mon Sep 17 00:00:00 2001 From: xhcao Date: Wed, 7 May 2025 04:31:07 +0800 Subject: [PATCH 23/84] [webgpu]: optimize pool operators (#24598) The patch optimizes pool operators when output size is small and kernel size is big ### Description ### Motivation and Context --- onnxruntime/core/providers/webgpu/nn/pool.cc | 74 +++++++++++++++++--- onnxruntime/core/providers/webgpu/nn/pool.h | 6 +- 2 files changed, 70 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/nn/pool.cc b/onnxruntime/core/providers/webgpu/nn/pool.cc index 12c135dbbf46d..d650392b71fb5 100644 --- a/onnxruntime/core/providers/webgpu/nn/pool.cc +++ b/onnxruntime/core/providers/webgpu/nn/pool.cc @@ -95,6 +95,9 @@ Status PoolProgram::GenerateShaderCode(ShaderHelper& shader) const { var_decl_code = SS_GET(var_decl_ss); sampling_code = " value = max(value, x_val);\n"; + if (are_small_output_big_kernel_) { + downsampling_code = " sum_or_max_shared[local_idx] = value;\n"; + } } else { SS(var_decl_ss, kStringInitialSize); var_decl_ss << " var value = " << (is_float16_ ? "f16(0)" : "f32(0)") << ";\n"; @@ -113,7 +116,12 @@ Status PoolProgram::GenerateShaderCode(ShaderHelper& shader) const { sampling_code = SS_GET(sampling_ss); SS(downsampling_ss, kStringInitialSize); - downsampling_ss << " value /= " << (is_float16_ ? "f16" : "f32") << "(count);\n"; + if (are_small_output_big_kernel_) { + downsampling_ss << " sum_or_max_shared[local_idx] = value;\n" + << " count_shared[local_idx] = count;\n"; + } else { + downsampling_ss << " value /= " << (is_float16_ ? "f16" : "f32") << "(count);\n"; + } downsampling_code = SS_GET(downsampling_ss); } @@ -125,13 +133,54 @@ Status PoolProgram::GenerateShaderCode(ShaderHelper& shader) const { auto data_dim_end = input.Rank(); data_dim_end = is_nhwc_ ? data_dim_end - 1 : data_dim_end; + std::string sum_or_max_shared; + if (are_small_output_big_kernel_) { + shader.AdditionalImplementation() + << "var sum_or_max_shared : array<" << (is_float16_ ? "f16" : "f32") << ",workgroup_size_x >;\n" + << (!is_max_pool_ ? 
"var count_shared : array;\n" : ""); + + SS(shared_ss, 512); + std::string sum_or_max_shared_op; + std::string count_shared_op; + if (is_max_pool_) { + sum_or_max_shared_op = "sum_or_max_shared[local_idx] = max(sum_or_max_shared[local_idx], sum_or_max_shared[local_idx + reduce_size]);\n"; + } else { + sum_or_max_shared_op = "sum_or_max_shared[local_idx] += sum_or_max_shared[local_idx + reduce_size];\n"; + count_shared_op = "count_shared[local_idx] += count_shared[local_idx + reduce_size];\n"; + } + + shared_ss << " workgroupBarrier();\n" + << " var reduce_size : u32 = workgroup_size_x;\n" + << " for (var curr_size = reduce_size >> 1; curr_size > 0; curr_size = reduce_size >> 1) {\n" + << " reduce_size = curr_size + (reduce_size & 1);\n" + << " if (local_idx < curr_size) {\n" + << " " << sum_or_max_shared_op + << " " << count_shared_op + << " }\n" + << " workgroupBarrier();\n" + << " }\n"; + sum_or_max_shared = SS_GET(shared_ss); + } + std::string kernel_loop_decl_code = are_small_output_big_kernel_ ? " for (var i: u32 = local_idx; i < uniforms.kernel_size; i += workgroup_size_x) {\n" : " for (var i: u32 = 0; i < uniforms.kernel_size; i++) {\n"; + + SS(output_ss, kStringInitialSize); + if (are_small_output_big_kernel_) { + output_ss << " if (local_idx == 0) {\n" + << " value = sum_or_max_shared[0]" << (!is_max_pool_ ? (is_float16_ ? " / f16(count_shared[0])" : " / f32(count_shared[0])") : "") << ";\n" + << " " << output.SetByOffset("workgroup_idx", "value") << ";\n" + << " }\n"; + } else { + output_ss << " " << output.SetByOffset("global_idx", "value") << ";\n"; + } + std::string output_code = SS_GET(output_ss); + auto& body = shader.MainFunctionBody(); - body << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") - << " let y_indices = " << output.OffsetToIndices("global_idx") << ";\n" + body << (are_small_output_big_kernel_ ? "" : shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")) + << " let y_indices = " << output.OffsetToIndices((are_small_output_big_kernel_ ? "workgroup_idx" : "global_idx")) << ";\n" << " var x_indices = y_indices;\n" << " var k_indices: array;\n" << var_decl_code - << " for (var i: u32 = 0; i < uniforms.kernel_size; i++) {\n" + << kernel_loop_decl_code << " var offset = i;\n" // ---- Compute offset to indices in pooling window. 
<< " for (var j = 0; j < " << kernel_rank << "; j++) {\n" @@ -162,7 +211,8 @@ Status PoolProgram::GenerateShaderCode(ShaderHelper& shader) const { << " }\n" << " }\n" << downsampling_code - << " " << output.SetByOffset("global_idx", "value") << ";\n"; + << sum_or_max_shared + << output_code; return Status::OK(); } @@ -225,7 +275,6 @@ Status Pool::ComputeInternal(ComputeContext& context) const { } bool is_float16 = X->GetElementType() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16; bool count_include_pad = pool_attrs_.count_include_pad; - PoolProgram program{is_max_pool, is_nhwc, kernel_shape, is_float16, count_include_pad}; // Number of elements uint32_t output_size = gsl::narrow_cast(Y->Shape().Size()); @@ -235,16 +284,25 @@ Status Pool::ComputeInternal(ComputeContext& context) const { const auto strides_u32 = NarrowToU32(strides); const auto dilations_u32 = NarrowToU32(dilations); - program.CacheHint(kernel_shape.size(), is_max_pool, is_nhwc, is_float16, count_include_pad) + bool are_small_output_big_kernel = output_size <= 128 && kernel_size >= 128; + PoolProgram program{is_max_pool, is_nhwc, kernel_shape, is_float16, count_include_pad, are_small_output_big_kernel}; + + program.CacheHint(kernel_shape.size(), is_max_pool, is_nhwc, is_float16, count_include_pad, are_small_output_big_kernel) .AddInputs({{X, ProgramTensorMetadataDependency::TypeAndRank}}) .AddOutputs({{Y}}) - .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({output_size, kernel_size, gsl::span(kernel_strides.data(), kernel_strides.size()), gsl::span(pads_u32.data(), pads_u32.size()), gsl::span(strides_u32.data(), strides_u32.size()), gsl::span(dilations_u32.data(), dilations_u32.size())}); + if (are_small_output_big_kernel) { + program.SetWorkgroupSize(128) + .SetDispatchGroupSize(output_size); + } else { + program.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE); + } + return context.RunProgram(program); } diff --git a/onnxruntime/core/providers/webgpu/nn/pool.h b/onnxruntime/core/providers/webgpu/nn/pool.h index c1716542e5549..57bdc64954acd 100644 --- a/onnxruntime/core/providers/webgpu/nn/pool.h +++ b/onnxruntime/core/providers/webgpu/nn/pool.h @@ -14,13 +14,14 @@ namespace webgpu { class PoolProgram final : public Program { public: PoolProgram(bool is_max_pool, bool is_nhwc, const TensorShapeVector& kernel_shape, bool is_float16, - bool count_include_pad) + bool count_include_pad, bool are_small_output_big_kernel) : Program{"Pool"}, is_max_pool_{is_max_pool}, is_nhwc_{is_nhwc}, kernel_shape_{kernel_shape}, is_float16_{is_float16}, - count_include_pad_{count_include_pad} {} + count_include_pad_{count_include_pad}, + are_small_output_big_kernel_{are_small_output_big_kernel} {} Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -39,6 +40,7 @@ class PoolProgram final : public Program { const TensorShapeVector kernel_shape_; const bool is_float16_; const bool count_include_pad_; + const bool are_small_output_big_kernel_; }; template From 3c6fa0efe4caf8e39b3754d75897dd98b73a63fb Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Tue, 6 May 2025 16:42:20 -0700 Subject: [PATCH 24/84] Add support for EP selection delegate callback to Python bindings (#24634) ### Description Follow up to https://github.com/microsoft/onnxruntime/pull/24614 Example Python program (adapted from unit tests) that specifies a custom EP selection function to select a OrtEpDevice(s) for compiling: ```python def test_compile_with_ep_selection_delegate(self): # ... 
# User's custom EP selection function. def my_delegate( ep_devices: Sequence[onnxrt.OrtEpDevice], model_metadata: dict[str, str], runtime_metadata: dict[str, str], max_selections: int, ) -> Sequence[onnxrt.OrtEpDevice]: self.assertTrue(len(model_metadata) > 0) self.assertTrue(ep_devices and max_selections > 0) # Select the first and last devices (if there are more than one) selected_devices = [ep_devices[0]] if max_selections > 2 and len(ep_devices) > 1: selected_devices.append(ep_devices[-1]) # ORT CPU EP is always last return selected_devices session_options = onnxrt.SessionOptions() session_options.set_provider_selection_policy_delegate(my_delegate) model_compiler = onnxrt.ModelCompiler( session_options, input_model_path, embed_compiled_data_into_model=True, external_initializers_file_path=None, ) model_compiler.compile_to_file(output_model_path) ``` How to raise an exception from the Python EP selection function: ```python # User's custom EP selection function. custom_error_message = "MY ERROR" def my_delegate_that_fails( ep_devices: Sequence[onnxrt.OrtEpDevice], model_metadata: dict[str, str], runtime_metadata: dict[str, str], max_selections: int, ) -> Sequence[onnxrt.OrtEpDevice]: self.assertTrue(len(ep_devices) >= 1) raise ValueError(custom_error_message) sess_options = onnxrt.SessionOptions() sess_options.set_provider_selection_policy_delegate(my_delegate_that_fails) # Create session and expect ORT to raise a Fail exception that contains our message. with self.assertRaises(Fail) as context: onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) self.assertIn(custom_error_message, str(context.exception)) ``` ### Motivation and Context --- .../core/session/provider_policy_context.cc | 6 + .../python/onnxruntime_pybind_state.cc | 136 +++++++++++++----- .../python/onnxruntime_pybind_state_common.h | 17 ++- .../python/onnxruntime_test_python_autoep.py | 86 ++++++++++- .../onnxruntime_test_python_compile_api.py | 41 ++++++ 5 files changed, 241 insertions(+), 45 deletions(-) diff --git a/onnxruntime/core/session/provider_policy_context.cc b/onnxruntime/core/session/provider_policy_context.cc index f706bd05d8494..a4e0c16b411a1 100644 --- a/onnxruntime/core/session/provider_policy_context.cc +++ b/onnxruntime/core/session/provider_policy_context.cc @@ -169,6 +169,12 @@ Status ProviderPolicyContext::SelectEpsForSession(const Environment& env, const return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "EP selection delegate did not select anything."); } + if (num_selected > selected_devices.size()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "EP selection delegate selected too many EP devices (", num_selected, "). 
", + "The limit is ", selected_devices.size(), " EP devices."); + } + // Copy the selected devices to the output vector devices_selected.reserve(num_selected); for (size_t i = 0; i < num_selected; ++i) { diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index c29a8b0497b1f..d5f8c7c181960 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -23,6 +23,7 @@ #include "core/framework/arena_extend_strategy.h" #include "core/framework/data_transfer_utils.h" #include "core/framework/data_types_internal.h" +#include "core/framework/error_code_helper.h" #include "core/framework/provider_options_utils.h" #include "core/framework/random_seed.h" #include "core/framework/sparse_tensor.h" @@ -1798,38 +1799,69 @@ void addGlobalMethods(py::module& m) { #endif } -// TODO(adrianlizarraga): C API's delegate function needs a void* param to store state. -// using PyEpSelectionDelegate = std::function(const std::vector& ep_devices, -// const std::unordered_map& model_metadata, -// const std::unordered_map& runtime_metadata)>; -// -// static OrtStatus* PyDelegateWrapper(void* delegate_state, -// _In_ const OrtEpDevice** ep_devices, -// _In_ size_t num_devices, -// _In_ const OrtKeyValuePairs* model_metadata, -// _In_opt_ const OrtKeyValuePairs* runtime_metadata, -// _Inout_ const OrtEpDevice** selected, -// _In_ size_t max_selected, -// _Out_ size_t* num_selected) { -// PyEpSelectionDelegate* actual_delegate = reinterpret_cast(delegate_state); -// std::vector py_ep_devices(ep_devices, ep_devices + num_devices); -// std::unordered_map py_model_metadata = -// model_metadata ? model_metadata->entries : std::unordered_map{}; -// std::unordered_map py_runtime_metadata = -// runtime_metadata ? runtime_metadata->entries : std::unordered_map{}; -// -// std::vector py_selected = (*actual_delegate)(py_ep_devices, py_model_metadata, py_runtime_metadata); -// -// // TODO: Check max_selected and return OrtStatus if necessary. -// assert(py_selected.size() <= max_selected); -// -// *num_selected = py_selected.size(); -// for (size_t i = 0; i < py_selected.size(); ++i) { -// selected[i] = py_selected[i]; -// } -// -// return nullptr; -// }; +#if !defined(ORT_MINIMAL_BUILD) +/** + * Calls the user's Python EP selection function and coverts the results to a format that can be used + * by ORT to select OrtEpDevice instances. The user's function is set by calling + * SessionOptions.set_provider_selection_policy_delegate() on the Python side. The result of this wrapper + * function is used in core/session/provider_policy_context.cc. + * + * @param ep_devices OrtEpDevices to select from. + * @param num_devices Number of OrtEpDevices to select from. + * @param model_metadata Model's metadata. + * @param runtime_metadata Runtime metadata. + * @param selected Pre-allocated OrtEpDevice buffer to update with selected devices. + * @param max_selected Maximum number of entries in the pre-allocated 'selected' buffer. + * @param state Opaque state that holds a pointer to the user's Python function. + * + * @return nullptr OrtStatus* to indicate success. 
+ */ +static OrtStatus* ORT_API_CALL PyEpSelectionPolicyWrapper(_In_ const OrtEpDevice** ep_devices, + _In_ size_t num_devices, + _In_ const OrtKeyValuePairs* model_metadata, + _In_opt_ const OrtKeyValuePairs* runtime_metadata, + _Inout_ const OrtEpDevice** selected, + _In_ size_t max_selected, + _Out_ size_t* num_selected, + _In_ void* state) { + PyEpSelectionDelegate* actual_delegate = reinterpret_cast(state); + std::vector py_ep_devices(ep_devices, ep_devices + num_devices); + std::unordered_map py_model_metadata = + model_metadata ? model_metadata->entries : std::unordered_map{}; + std::unordered_map py_runtime_metadata = + runtime_metadata ? runtime_metadata->entries : std::unordered_map{}; + + *num_selected = 0; + std::vector py_selected; + OrtStatus* status = nullptr; + + // Call the Python delegate function and convert any exceptions to a status. + ORT_TRY { + py_selected = (*actual_delegate)(py_ep_devices, py_model_metadata, py_runtime_metadata, max_selected); + } + ORT_CATCH(const std::exception& e) { + ORT_HANDLE_EXCEPTION([&]() { + status = ToOrtStatus(ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, e.what())); + }); + } + + if (status != nullptr) { + return status; + } + + if (py_selected.size() > max_selected) { + return ToOrtStatus(ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "selected too many EP devices (", py_selected.size(), "). ", + "The limit is ", max_selected, " EP devices.")); + } + + *num_selected = py_selected.size(); + for (size_t i = 0; i < py_selected.size(); ++i) { + selected[i] = py_selected[i]; + } + + return nullptr; +} +#endif // !defined(ORT_MINIMAL_BUILD) void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn) { py::enum_(m, "GraphOptimizationLevel") @@ -2035,21 +2067,47 @@ must refer to the same execution provider.)pbdoc") .def( // Equivalent to the C API's SessionOptionsSetEpSelectionPolicy. "set_provider_selection_policy", - [](PySessionOptions* sess_options, + [](PySessionOptions* py_sess_options, OrtExecutionProviderDevicePolicy policy) { #if !defined(ORT_MINIMAL_BUILD) - sess_options->value.ep_selection_policy.enable = true; - sess_options->value.ep_selection_policy.policy = policy; - sess_options->value.ep_selection_policy.delegate = nullptr; // TODO: need a void* param in delegate. + py_sess_options->py_ep_selection_delegate = nullptr; + + py_sess_options->value.ep_selection_policy.enable = true; + py_sess_options->value.ep_selection_policy.policy = policy; + py_sess_options->value.ep_selection_policy.delegate = nullptr; + py_sess_options->value.ep_selection_policy.state = nullptr; #else - ORT_UNUSED_PARAMETER(sess_options); + ORT_UNUSED_PARAMETER(py_sess_options); ORT_UNUSED_PARAMETER(policy); ORT_THROW("EP selection policies are not supported in this build"); #endif }, R"pbdoc(Sets the execution provider selection policy for the session. Allows users to specify a -selection policy for automatic execution provider (EP) selection, or provide a delegate callback -for custom selection logic.)pbdoc") +selection policy for automatic execution provider (EP) selection.)pbdoc") + .def( + // Equivalent to the C API's SessionOptionsSetEpSelectionPolicyDelegate. 
+ "set_provider_selection_policy_delegate", + [](PySessionOptions* py_sess_options, + PyEpSelectionDelegate delegate_fn) { +#if !defined(ORT_MINIMAL_BUILD) + py_sess_options->py_ep_selection_delegate = delegate_fn; // Store python callback in PySessionOptions + + py_sess_options->value.ep_selection_policy.enable = true; + py_sess_options->value.ep_selection_policy.policy = OrtExecutionProviderDevicePolicy_DEFAULT; + py_sess_options->value.ep_selection_policy.delegate = PyEpSelectionPolicyWrapper; + py_sess_options->value.ep_selection_policy.state = + reinterpret_cast(&py_sess_options->py_ep_selection_delegate); +#else + ORT_UNUSED_PARAMETER(py_sess_options); + ORT_UNUSED_PARAMETER(delegate_fn); + ORT_THROW("EP selection policies are not supported in this build"); +#endif + }, + R"pbdoc(Sets the execution provider selection policy delegate for the session. Allows users to specify a +custom selection policy function for automatic execution provider (EP) selection. The delegate must return a list of +selected OrtEpDevice instances. The signature of the delegate is +def custom_delegate(ep_devices: Sequence[OrtEpDevice], model_metadata: dict[str, str], runtime_metadata: dict[str, str], +max_selections: int) -> Sequence[OrtEpDevice])pbdoc") .def( "has_providers", [](PySessionOptions* sess_options) -> bool { diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h index a964f199d43b3..4114bd4078799 100644 --- a/onnxruntime/python/onnxruntime_pybind_state_common.h +++ b/onnxruntime/python/onnxruntime_pybind_state_common.h @@ -4,6 +4,8 @@ #pragma once +#include + #include "core/common/logging/logging.h" #include "core/common/logging/sinks/cerr_sink.h" #include "core/common/optional.h" @@ -229,7 +231,20 @@ extern OrtDevice::DeviceId cuda_device_id; // TODO remove deprecated global config extern size_t gpu_mem_limit; -using PySessionOptions = OrtSessionOptions; +#if !defined(ORT_MINIMAL_BUILD) +using PyEpSelectionDelegate = std::function(const std::vector& ep_devices, + const std::unordered_map& model_metadata, + const std::unordered_map& runtime_metadata, + size_t max_selections)>; +#endif + +// Thin wrapper over internal C OrtSessionOptions to store additional state. +struct PySessionOptions : public OrtSessionOptions { +#if !defined(ORT_MINIMAL_BUILD) + // Callback function from Python application that allows the user to specify custom EP selection logic. + PyEpSelectionDelegate py_ep_selection_delegate; +#endif // !defined(ORT_MINIMAL_BUILD) +}; // Thin wrapper over internal C++ InferenceSession to accommodate custom op library management for the Python user struct PyInferenceSession { diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index 1d7dba5662257..61dc0ff221318 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -6,13 +6,14 @@ import platform import sys import unittest +from collections.abc import Sequence import numpy as np from autoep_helper import AutoEpTestCase from helper import get_name import onnxruntime as onnxrt -from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument +from onnxruntime.capi.onnxruntime_pybind11_state import Fail, InvalidArgument # handle change from python 3.8 and on where loading a dll from the current directory needs to be explicitly allowed. 
if platform.system() == "Windows" and sys.version_info.major >= 3 and sys.version_info.minor >= 8: # noqa: YTT204 @@ -68,8 +69,8 @@ def test_cuda_ep_register_and_inference(self): output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - # TODO(adrianlizarraga): Unregistering CUDA EP library causes issues. Investigate. - # self.unregister_execution_provider_library(ep_registration_name) + del sess # Delete session before unregistering library + self.unregister_execution_provider_library(ep_registration_name) def test_cuda_prefer_gpu_and_inference(self): """ @@ -100,8 +101,83 @@ def test_cuda_prefer_gpu_and_inference(self): output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - # TODO(adrianlizarraga): Unregistering CUDA EP library causes issues. Investigate. - # self.unregister_execution_provider_library(ep_registration_name) + del sess # Delete session before unregistering library + self.unregister_execution_provider_library(ep_registration_name) + + def test_cuda_ep_selection_delegate_and_inference(self): + """ + Test selecting CUDA EP via the custom EP selection delegate function and then run inference. + """ + ep_lib_path = "onnxruntime_providers_cuda.dll" + ep_registration_name = "CUDAExecutionProvider" + + if sys.platform != "win32": + self.skipTest("Skipping test because device discovery is only supported on Windows") + + if ep_registration_name not in available_providers: + self.skipTest("Skipping test because it needs to run on CUDA EP") + + self.register_execution_provider_library(ep_registration_name, ep_lib_path) + + # User's custom EP selection function. + def my_delegate( + ep_devices: Sequence[onnxrt.OrtEpDevice], + model_metadata: dict[str, str], + runtime_metadata: dict[str, str], + max_selections: int, + ) -> Sequence[onnxrt.OrtEpDevice]: + self.assertGreater(len(model_metadata), 0) + self.assertGreaterEqual(len(ep_devices), 2) + self.assertGreaterEqual(max_selections, 2) + + cuda_ep_device = next((d for d in ep_devices if d.ep_name == ep_registration_name), None) + self.assertIsNotNone(cuda_ep_device) + + # Select the CUDA device and the ORT CPU EP device (should always be last) + return [cuda_ep_device, ep_devices[-1]] + + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy_delegate(my_delegate) + self.assertTrue(sess_options.has_providers()) + + # Run sample model and check output + sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + input_name = sess.get_inputs()[0].name + res = sess.run([], {input_name: x}) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + + del sess # Delete session before unregistering library + self.unregister_execution_provider_library(ep_registration_name) + + def test_custom_ep_selection_delegate_that_raises_error(self): + """ + Test a custom EP selection delegate function that raises a Python exception. ORT should re-raise as FAIL. + """ + if sys.platform != "win32": + self.skipTest("Skipping test because device discovery is only supported on Windows") + + # User's custom EP selection function. 
+ custom_error_message = "MY ERROR" + + def my_delegate_that_fails( + ep_devices: Sequence[onnxrt.OrtEpDevice], + model_metadata: dict[str, str], + runtime_metadata: dict[str, str], + max_selections: int, + ) -> Sequence[onnxrt.OrtEpDevice]: + self.assertGreaterEqual(len(ep_devices), 1) + raise ValueError(custom_error_message) + + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy_delegate(my_delegate_that_fails) + + # Create session and expect ORT to raise a Fail exception that contains our message. + with self.assertRaises(Fail) as context: + onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + self.assertIn(custom_error_message, str(context.exception)) def test_example_plugin_ep_devices(self): """ diff --git a/onnxruntime/test/python/onnxruntime_test_python_compile_api.py b/onnxruntime/test/python/onnxruntime_test_python_compile_api.py index f5f23e2da1e43..866388f3aa226 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_compile_api.py +++ b/onnxruntime/test/python/onnxruntime_test_python_compile_api.py @@ -6,6 +6,7 @@ import platform import sys import unittest +from collections.abc import Sequence import onnx from autoep_helper import AutoEpTestCase @@ -52,6 +53,46 @@ def test_compile_with_files_prefer_npu_policy(self): self.assertTrue(os.path.exists(output_model_path)) self.unregister_execution_provider_library(ep_registration_name) + def test_compile_with_ep_selection_delegate(self): + """ + Tests compiling a model (to/from files) using an EP selection delegate callback. + """ + if sys.platform != "win32": + self.skipTest("Skipping test because provider selection policies are only supported on Windows") + + input_model_path = get_name("nhwc_resize_scales_opset18.onnx") + output_model_path = os.path.join(self._tmp_dir_path, "model.compiled.delegate.onnx") + + # User's custom EP selection function. + def my_delegate( + ep_devices: Sequence[onnxrt.OrtEpDevice], + model_metadata: dict[str, str], + runtime_metadata: dict[str, str], + max_selections: int, + ) -> Sequence[onnxrt.OrtEpDevice]: + self.assertGreater(len(ep_devices), 0) + self.assertGreater(len(model_metadata), 0) + self.assertGreater(max_selections, 0) + + # Select the first and last devices (if there are more than one) + selected_devices = [ep_devices[0]] + if max_selections > 2 and len(ep_devices) > 1: + selected_devices.append(ep_devices[-1]) # ORT CPU EP is always last + + return selected_devices + + session_options = onnxrt.SessionOptions() + session_options.set_provider_selection_policy_delegate(my_delegate) + + model_compiler = onnxrt.ModelCompiler( + session_options, + input_model_path, + embed_compiled_data_into_model=True, + external_initializers_file_path=None, + ) + model_compiler.compile_to_file(output_model_path) + self.assertTrue(os.path.exists(output_model_path)) + def test_compile_with_input_and_output_files(self): """ Tests compiling a model (to/from files) using explicit EP. From c1ef02f74b0d648cc7e8558805fa90846ea11a35 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Tue, 6 May 2025 17:52:55 -0700 Subject: [PATCH 25/84] [Gen C\C++ API docs] Fix documentation warnings for new autoEP/compile APIs (#24661) ### Description Fixes documentation errors in comments within onnxruntime_c_api.h and onnxruntime__cxx_api.h. 
### Motivation and Context The [Generate C/C++ API docs](https://github.com/microsoft/onnxruntime/actions/runs/14855108283/job/41706460753#logs) action is failing with error: ```shell Run mkdir -p build/doxygen /mnt/vss/_work/onnxruntime/onnxruntime/include/onnxruntime/core/session/onnxruntime_cxx_api.h:775: error: explicit link request to 'OrtKeyValuePair' could not be resolved (warning treated as error, aborting now) ``` --- include/onnxruntime/core/session/onnxruntime_c_api.h | 12 ++++++------ .../onnxruntime/core/session/onnxruntime_cxx_api.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 6c7d910b4963b..a4a32fbea630a 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -435,9 +435,9 @@ typedef enum OrtExecutionProviderDevicePolicy { * \param model_metadata The model metadata. * \param runtime_metadata The runtime metadata. May be nullptr. * \param selected Pre-allocated array to populate with selected OrtEpDevice pointers from ep_devices. - * \param max_ep_devices The maximum number of devices that can be selected in the pre-allocated array. - Currently the maximum is 8. - * \param num_ep_devices The number of selected devices. + * \param max_selected The maximum number of devices that can be selected in the pre-allocated array. + Currently the maximum is 8. + * \param num_selected The number of selected devices. * \param state Opaque pointer. Required to use the delegate from other languages like C# and python. * * \return OrtStatus* Selection status. Return nullptr on success. @@ -6116,7 +6116,7 @@ struct OrtEpFactory { * \param[in] session_options The OrtSessionOptions instance that contains the configuration options for the * session. This will include ep_options from GetSupportedDevices as well as any * user provided overrides. - * Execution provider options will have been added with a prefix of 'ep..'. + * Execution provider options will have been added with a prefix of 'ep.[ep name].'. * The OrtSessionOptions instance will NOT be valid after this call and should not be * stored for later use. * \param[in] logger The OrtLogger instance for the session that the execution provider should use for logging. @@ -6124,7 +6124,7 @@ struct OrtEpFactory { * * \snippet{doc} snippets.dox OrtStatus Return Value * - * \since Version . This is a placeholder. + * \since Version [coming soon]. This is a placeholder. */ OrtStatus*(ORT_API_CALL* CreateEp)(_In_ OrtEpFactory* this_ptr, _In_reads_(num_devices) const OrtHardwareDevice* const* devices, @@ -6138,7 +6138,7 @@ struct OrtEpFactory { * \param[in] this_ptr The OrtEpFactory instance. * \param[in] ep The OrtEp instance to release. * - * \since Version . This is a placeholder. + * \since Version [coming soon]. This is a placeholder. 
*/ void(ORT_API_CALL* ReleaseEp)(OrtEpFactory* this_ptr, struct OrtEp* ep); }; diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index bc6f381bb82a0..39c20e237b02c 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -772,7 +772,7 @@ struct KeyValuePairsImpl : Ort::detail::Base { // Const object holder that does not own the underlying object using ConstKeyValuePairs = detail::KeyValuePairsImpl>; -/** \brief Wrapper around ::OrtKeyValuePair */ +/** \brief Wrapper around ::OrtKeyValuePairs */ struct KeyValuePairs : detail::KeyValuePairsImpl { explicit KeyValuePairs(std::nullptr_t) {} ///< No instance is created /// Take ownership of a pointer created by C API From 6fef0693192d879e6adae4412d9164b823441309 Mon Sep 17 00:00:00 2001 From: Satya Kumar Jandhyala Date: Tue, 6 May 2025 18:51:12 -0700 Subject: [PATCH 26/84] [Native WebGPU] Added ScatterND (#24623) ### Description Added ScatterND operator to Native WebGPU EP. ### Motivation and Context Required to increase coverage. --- .../core/providers/webgpu/shader_helper.cc | 19 +- .../providers/webgpu/tensor/scatter_nd.cc | 222 ++++++++++++++++++ .../core/providers/webgpu/tensor/scatter_nd.h | 60 +++++ .../webgpu/webgpu_execution_provider.cc | 10 + 4 files changed, 305 insertions(+), 6 deletions(-) create mode 100644 onnxruntime/core/providers/webgpu/tensor/scatter_nd.cc create mode 100644 onnxruntime/core/providers/webgpu/tensor/scatter_nd.h diff --git a/onnxruntime/core/providers/webgpu/shader_helper.cc b/onnxruntime/core/providers/webgpu/shader_helper.cc index 59855e6117641..36f6b512a0a93 100644 --- a/onnxruntime/core/providers/webgpu/shader_helper.cc +++ b/onnxruntime/core/providers/webgpu/shader_helper.cc @@ -125,7 +125,8 @@ namespace { // Validate if the tensor element type matches the program variable data type Status ValidateVariableDataType(int32_t element_type, ProgramVariableDataType var_type, bool is_atomic = false) { if (is_atomic) { - ORT_RETURN_IF_NOT(var_type == ProgramVariableDataType::Int32 || var_type == ProgramVariableDataType::Uint32, + // float32 is not a valid data type for atomic. However the data may be bitcast-ed to i32 and used to simulate atomic operation using atomicCompareExchangeWeak. 
+ ORT_RETURN_IF_NOT(var_type == ProgramVariableDataType::Int32 || var_type == ProgramVariableDataType::Uint32 || var_type == ProgramVariableDataType::Float32, "Unexpected program variable type ", int(var_type), " for atomic variable"); } @@ -422,11 +423,17 @@ Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector& sha bool is_atomic = program_.Outputs()[i].is_atomic; ss << "@group(0) @binding(" << input_vars_.size() + i << ") var " << output->name_ << ": array<"; if (is_atomic) { - ss << "atomic<"; - } - ss << output->StorageType(); - if (is_atomic) { - ss << ">"; + if (output->type_ == ProgramVariableDataType::Float32) { + ss << "atomic"; + } else if (output->type_ == ProgramVariableDataType::Uint32) { + ss << "atomic"; + } else if (output->type_ == ProgramVariableDataType::Int32) { + ss << "atomic"; + } else { + ORT_RETURN_IF(true, "Unsupported atomic type: ", int(output->type_)); + } + } else { + ss << output->StorageType(); } ss << ">;\n"; } diff --git a/onnxruntime/core/providers/webgpu/tensor/scatter_nd.cc b/onnxruntime/core/providers/webgpu/tensor/scatter_nd.cc new file mode 100644 index 0000000000000..986255ea1f185 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/scatter_nd.cc @@ -0,0 +1,222 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/shader_variable.h" +#include "scatter_nd.h" + +namespace onnxruntime { +namespace webgpu { + +Status ScatterNDProgram::GenerateShaderCode(ShaderHelper& shader) const { + shader.AddInput("indices", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); + shader.AddInput("updates", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseShapeAndStride); + const auto output_rank = static_cast(output.Rank()); + auto atomic_reduction_snippet = [](ScatterNDReduction reduction, const std::string& ptr, const std::string& value, const std::string& data_type) -> std ::string { + std::ostringstream ss; + bool is_32_bit_integer = data_type == "i32" || data_type == "u32"; + bool is_unsigned_integer = data_type == "u32"; + std::ostringstream ss_float_start; + ss_float_start << " {\n" + << " var oldValue = 0" << (is_unsigned_integer ? "u" : "") << ";\n" + << " loop {\n" + << " let newValueF32 = "; + std::ostringstream ss_float_end; + ss_float_end << ";\n" + << " let newValue = bitcast<" << (is_unsigned_integer ? "u32" : "i32") << ">(newValueF32);\n" + << " let res = atomicCompareExchangeWeak(&" << ptr << ", oldValue, newValue);\n" + << " if res.exchanged {\n" + << " break;\n" + << " }\n" + << " oldValue = res.old_value;\n" + << " }\n" + << " }\n"; + switch (reduction) { + case ScatterNDReduction::None: + ss << " " << ptr << " = " << value << ";\n"; + break; + case ScatterNDReduction::Add: + if (is_32_bit_integer) { + ss << " atomicAdd(&" << ptr << ", bitcast<" << data_type << ">(" << value << "));\n"; + } else { + // atomicAdd only supports uint/int type. For float, we use + // atomicCompareExchangeWeak to simulate. 
+ ss << ss_float_start.str() << "bitcast<" << data_type << ">(oldValue) + (" << value << ")" << ss_float_end.str() + << "\n"; + } + break; + case ScatterNDReduction::Max: + if (is_32_bit_integer) { + ss << " atomicMax(&" << ptr << ", bitcast<" << data_type << ">(" << value << "));\n"; + } else { + // atomicMax only supports uint/int type. For float, we use + // atomicCompareExchangeWeak to simulate. + ss << ss_float_start.str() << "max(bitcast<" << data_type << ">(oldValue), (" << value << "))" << ss_float_end.str(); + } + break; + case ScatterNDReduction::Min: + if (is_32_bit_integer) { + ss << " atomicMin(&" << ptr << ", bitcast<" << data_type << ">(" << value << "));\n"; + } else { + // atomicMin only supports uint/int type. For float, we use + // atomicCompareExchangeWeak to simulate. + ss << ss_float_start.str() << "min(bitcast<" << data_type << ">(oldValue), (" << value << "))" << ss_float_end.str(); + } + break; + case ScatterNDReduction::Mul: + // atomicMul is not supported, we use atomicCompareExchangeWeak to simulate. + ss << ss_float_start.str() << "(bitcast<" << data_type << ">(oldValue) * (" << value << "))" << ss_float_end.str(); + break; + default: + ORT_THROW("Unsupported reduction type: ", static_cast(reduction)); + // The controlflow should never reach here. + } + return ss.str(); + }; + + auto calc_data_offset_snippet = [](size_t output_rank) -> std::string { + std::ostringstream ss; + if (output_rank < 2) { + ss << " let element_count_dim = 1u;\n"; + } else { + ss << " let element_count_dim = select(" << GetElementAt("uniforms.output_stride", "i - indices_start", output_rank - 1) << ", 1u, i - indices_start == " << (output_rank - 1) << ");\n"; + } + ss << " let dim_value = " << GetElementAt("uniforms.output_shape", "i - indices_start", output_rank) << ";\n" + << " if (index >= 0) {\n" + << " if (index >= i32(dim_value)) {\n" + << " index = i32(dim_value - 1);\n" + << " }\n" + << " } else {\n" + << " if (index < -i32(dim_value)) {\n" + << " index = 0;\n" + << " } else {\n" + << " index += i32(dim_value);\n" + << " }\n" + << " }\n" + << " data_offset += u32((u32(index) * element_count_dim));\n"; + return ss.str(); + }; + + auto update_elements_snippet = [atomic_reduction_snippet](ScatterNDReduction reduction, const std::string& data_type) -> std::string { + std::ostringstream ss; + ss << " for (var i = 0u; i < uniforms.num_updates_elements; i++) {\n" + << " let value = updates[uniforms.num_updates_elements * global_idx + i];\n" + << atomic_reduction_snippet(reduction, "output[data_offset + i]", "value", data_type) << "\n" + << " }\n"; + return ss.str(); + }; + std::string data_type_str; + bool reducible = false; + if (data_type_ == DataTypeImpl::GetType()) { + reducible = true; + data_type_str = "i32"; + } else if (data_type_ == DataTypeImpl::GetType()) { + reducible = true; + data_type_str = "u32"; + } else if (data_type_ == DataTypeImpl::GetType()) { + reducible = true; + data_type_str = "f32"; + } else { + // Default value. 
+ data_type_str = "output_element_t"; + } + if (reduction_ != ScatterNDReduction::None && !reducible) { + ORT_THROW("ScatterND: Reduction is not supported for data type ", data_type_str); + } + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << " var data_offset = 0u;\n" + << " var indices_start = uniforms.last_index_dimension * global_idx;\n" + << " var indices_end = indices_start + uniforms.last_index_dimension;\n" + << " for (var i = indices_start; i < indices_end; i++) {\n" + << " var index = i32(indices[i].x);\n" + << calc_data_offset_snippet(output_rank) + << " }\n" + << update_elements_snippet(reduction_, data_type_str); + return Status::OK(); +} + +Status ScatterND::ComputeInternal(ComputeContext& context) const { + const Tensor* input = context.Input(0); + const auto* indices = context.Input(1); + const auto* updates = context.Input(2); + const auto& input_shape = input->Shape(); + const auto& indices_shape = indices->Shape(); + auto indices_rank = indices_shape.NumDimensions(); + auto last_index_dimension = static_cast(indices_shape[indices_rank - 1]); + auto num_updates_elements = static_cast(input_shape.SizeFromDimension(last_index_dimension)); + // TODO: support bool with components 4. + const size_t components = 1; + auto output_size = static_cast((indices_shape.SizeToDimension(indices_rank - 1) + components - 1) / components); + auto* output = context.Output(0, input_shape); + MLDataType data_type = input->DataType(); + const void* source = input->DataRaw(); + void* target = output->MutableDataRaw(); + // If source and target pointers are not equal (non-inplace operation), we need to copy the data. + if (target != source) { + ORT_RETURN_IF_ERROR(Info().GetDataTransferManager().CopyTensor(*input, *output)); + } + ScatterNDProgram program(reduction_, data_type); + program + .CacheHint(static_cast(reduction_)) + .AddInputs({{indices, ProgramTensorMetadataDependency::TypeAndRank}, + {updates, ProgramTensorMetadataDependency::TypeAndRank}}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({output_size, last_index_dimension, num_updates_elements}); + if (reduction_ != ScatterNDReduction::None && (data_type == DataTypeImpl::GetType() || data_type == DataTypeImpl::GetType() || + data_type == DataTypeImpl::GetType())) { + program.AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank, ProgramOutput::Atomic}); + } else { + program.AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank}); + } + return context.RunProgram(program); +} + +ONNX_OPERATOR_KERNEL_EX( + ScatterND, + kOnnxDomain, + 18, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedNumberTypes()) + .MayInplace(0, 0), + ScatterND); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + ScatterND, + kOnnxDomain, + 16, + 17, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedNumberTypes()) + .MayInplace(0, 0), + ScatterND); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + ScatterND, + kOnnxDomain, + 13, + 15, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedNumberTypes()) + .MayInplace(0, 0), + ScatterND); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + ScatterND, + kOnnxDomain, + 11, + 12, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedNumberTypes()) + .MayInplace(0, 0), + ScatterND); +} // namespace webgpu +} // namespace onnxruntime diff --git 
a/onnxruntime/core/providers/webgpu/tensor/scatter_nd.h b/onnxruntime/core/providers/webgpu/tensor/scatter_nd.h new file mode 100644 index 0000000000000..40bcbadebf65d --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/scatter_nd.h @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/program.h" +#include "core/framework/data_transfer_manager.h" + +namespace onnxruntime { +namespace webgpu { + +enum class ScatterNDReduction : int { + None = 0, + Add = 1, + Mul = 2, + Min = 3, + Max = 4, +}; + +class ScatterNDProgram final : public Program { + public: + ScatterNDProgram(ScatterNDReduction reduction, MLDataType data_type) : Program{"ScatterND"}, reduction_(reduction), data_type_(data_type) {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}, + {"last_index_dimension", ProgramUniformVariableDataType::Uint32}, + {"num_updates_elements", ProgramUniformVariableDataType::Uint32}); + ScatterNDReduction reduction_; + MLDataType data_type_; +}; + +class ScatterND : public WebGpuKernel { + public: + ScatterND(const OpKernelInfo& info) : WebGpuKernel(info) { + std::string reduction = info.GetAttrOrDefault("reduction", "none"); + if (reduction == "add") { + reduction_ = ScatterNDReduction::Add; + } else if (reduction == "mul") { + reduction_ = ScatterNDReduction::Mul; + } else if (reduction == "min") { + reduction_ = ScatterNDReduction::Min; + } else if (reduction == "max") { + reduction_ = ScatterNDReduction::Max; + } else if (reduction == "none") { + reduction_ = ScatterNDReduction::None; + } else { + ORT_THROW("Reduction '", reduction, "' is not supported on webgpu when opset <= 18."); + } + } + + Status ComputeInternal(ComputeContext& context) const override; + + private: + ScatterNDReduction reduction_{ScatterNDReduction::None}; +}; + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 928e48d78d7e5..9ea79e4cf28a3 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -400,6 +400,11 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 21, 22, DequantizeLinear); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 23, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ScatterND); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 15, ScatterND); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, 17, ScatterND); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ScatterND); + std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); @@ -732,6 +737,11 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { From 
8e7c0ac3acbcfb98ee0371579b642434c5ecc403 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Tue, 6 May 2025 20:28:36 -0700 Subject: [PATCH 27/84] Use build id when publishing symbols (#24662) --- .../c-api-artifacts-package-and-publish-steps-windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml index 8c3a9eba82356..1f71e8c0e0125 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml @@ -43,7 +43,7 @@ steps: SymbolExpirationInDays: '36530' IndexableFileFormats: 'Default' DetailedLog: true - SymbolsArtifactName: 'Symbols_${{parameters.buildConfig}}' + SymbolsArtifactName: 'Symbols_${{parameters.artifactNameNoVersionString}}_$(Build.BuildId)' - task: CmdLine@2 displayName: 'Copy build artifacts for zipping' From f14fd59fe9bfe419376357128163e687bf88b6d6 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 7 May 2025 13:33:48 +1000 Subject: [PATCH 28/84] More error checking for user provided C# policy selection delegate (#24666) ### Description Handle user selection policy delegate throwing or returning too many selections in C# code and create error message. ### Motivation and Context --- .../NativeMethods.shared.cs | 8 ++ .../SessionOptions.shared.cs | 82 +++++++++++------- .../OrtAutoEpTests.cs | 84 +++++++++++++++++++ 3 files changed, 144 insertions(+), 30 deletions(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index c543414ca13a9..664a77ceab037 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -407,12 +407,15 @@ static NativeMethods() api_ = (OrtApi)OrtGetApi(ORT_API_VERSION); OrtGetVersionString = (DOrtGetVersionString)Marshal.GetDelegateForFunctionPointer(OrtGetApiBase().GetVersionString, typeof(DOrtGetVersionString)); #endif + OrtCreateStatus = (DOrtCreateStatus)Marshal.GetDelegateForFunctionPointer( + api_.CreateStatus, typeof(DOrtCreateStatus)); OrtCreateEnv = (DOrtCreateEnv)Marshal.GetDelegateForFunctionPointer(api_.CreateEnv, typeof(DOrtCreateEnv)); OrtCreateEnvWithCustomLogger = (DOrtCreateEnvWithCustomLogger)Marshal.GetDelegateForFunctionPointer(api_.CreateEnvWithCustomLogger, typeof(DOrtCreateEnvWithCustomLogger)); OrtCreateEnvWithGlobalThreadPools = (DOrtCreateEnvWithGlobalThreadPools)Marshal.GetDelegateForFunctionPointer(api_.CreateEnvWithGlobalThreadPools, typeof(DOrtCreateEnvWithGlobalThreadPools)); OrtCreateEnvWithCustomLoggerAndGlobalThreadPools = (DOrtCreateEnvWithCustomLoggerAndGlobalThreadPools)Marshal.GetDelegateForFunctionPointer(api_.CreateEnvWithCustomLoggerAndGlobalThreadPools, typeof(DOrtCreateEnvWithCustomLoggerAndGlobalThreadPools)); OrtReleaseEnv = (DOrtReleaseEnv)Marshal.GetDelegateForFunctionPointer(api_.ReleaseEnv, typeof(DOrtReleaseEnv)); + OrtEnableTelemetryEvents = (DOrtEnableTelemetryEvents)Marshal.GetDelegateForFunctionPointer(api_.EnableTelemetryEvents, typeof(DOrtEnableTelemetryEvents)); OrtDisableTelemetryEvents = (DOrtDisableTelemetryEvents)Marshal.GetDelegateForFunctionPointer(api_.DisableTelemetryEvents, typeof(DOrtDisableTelemetryEvents)); @@ -933,6 +936,11 @@ internal class 
NativeLib #endregion Status API #region InferenceSession API + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtCreateStatus( + uint /* OrtErrorCode */ code, + byte[] /* const char* */ msg); + public static DOrtCreateStatus OrtCreateStatus; [UnmanagedFunctionPointer(CallingConvention.Winapi)] public delegate IntPtr /* OrtStatus* */ DOrtCreateSession( diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs index d60bf75ccbd7c..9794d2c184d5d 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs @@ -989,41 +989,63 @@ public static IntPtr EpSelectionPolicyWrapper(IntPtr /* OrtEpDevice** */ epDevic out UIntPtr numSelected, IntPtr state) { - Span epDevicesIntPtrs; - Span selectedDevicesIntPtrs; - EpSelectionPolicyConnector connector = (EpSelectionPolicyConnector)GCHandle.FromIntPtr(state).Target; + numSelected = UIntPtr.Zero; - unsafe + try { - void* ptr = epDevicesIn.ToPointer(); - epDevicesIntPtrs = new Span(ptr, checked((int)numDevices)); - } - - List epDevices = new List(); - for (int i = 0; i < numDevices; i++) - { - - epDevices.Add(new OrtEpDevice(epDevicesIntPtrs[i])); - } - OrtKeyValuePairs modelMetadata = new OrtKeyValuePairs(modelMetadataIn); - OrtKeyValuePairs runtimeMetadata = new OrtKeyValuePairs(runtimeMetadataIn); - - var selected = connector._csharpDelegate(epDevices, modelMetadata, runtimeMetadata, maxSelected); - - numSelected = (UIntPtr)selected.Count; - - unsafe - { - void* ptr = selectedOut.ToPointer(); - selectedDevicesIntPtrs = new Span(ptr, (int)maxSelected); + Span epDevicesIntPtrs; + Span selectedDevicesIntPtrs; + EpSelectionPolicyConnector connector = (EpSelectionPolicyConnector)GCHandle.FromIntPtr(state).Target; + + unsafe + { + void* ptr = epDevicesIn.ToPointer(); + epDevicesIntPtrs = new Span(ptr, checked((int)numDevices)); + } + + List epDevices = new List(); + for (int i = 0; i < numDevices; i++) + { + + epDevices.Add(new OrtEpDevice(epDevicesIntPtrs[i])); + } + + OrtKeyValuePairs modelMetadata = new OrtKeyValuePairs(modelMetadataIn); + OrtKeyValuePairs runtimeMetadata = new OrtKeyValuePairs(runtimeMetadataIn); + + var selected = connector._csharpDelegate(epDevices, modelMetadata, runtimeMetadata, maxSelected); + + if (selected.Count > maxSelected) + { + var error = $"The number of selected devices ({selected.Count}) returned by " + + $"the C# selection delegate exceeds the maximum ({maxSelected})."; + IntPtr status = NativeMethods.OrtCreateStatus((uint)ErrorCode.Fail, + NativeOnnxValueHelper.StringToZeroTerminatedUtf8(error)); + return status; + } + + numSelected = (UIntPtr)selected.Count; + + unsafe + { + void* ptr = selectedOut.ToPointer(); + selectedDevicesIntPtrs = new Span(ptr, (int)maxSelected); + } + + int idx = 0; + foreach (var epDevice in selected) + { + selectedDevicesIntPtrs[idx] = epDevice.Handle; + idx++; + } } - - int idx = 0; - foreach (var epDevice in selected) + catch (Exception ex) { - selectedDevicesIntPtrs[idx] = epDevice.Handle; - idx++; + var error = $"The C# selection delegate threw an exception: {ex.Message}"; + IntPtr status = NativeMethods.OrtCreateStatus((uint)ErrorCode.Fail, + NativeOnnxValueHelper.StringToZeroTerminatedUtf8(error)); + return status; } return IntPtr.Zero; diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs 
index d95a649bd95c5..9368f9d8bc298 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtAutoEpTests.cs @@ -198,5 +198,89 @@ public void SetEpSelectionPolicyDelegate() Assert.NotNull(session); } } + + // select max + 1, starting with all devices + private static List SelectionPolicyDelegateTooMany(IReadOnlyList epDevices, + OrtKeyValuePairs modelMetadata, + OrtKeyValuePairs runtimeMetadata, + uint maxSelections) + { + Assert.NotEmpty(modelMetadata.Entries); + Assert.True(epDevices.Count > 0); + var selected = new List(epDevices); + + while (selected.Count < (maxSelections + 1)) + { + selected.Add(epDevices.Last()); + } + + return selected; + } + + [Fact] + public void SetEpSelectionPolicyDelegateTooMany() + { + using SessionOptions sessionOptions = new SessionOptions(); + sessionOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE; + + var epDevices = ortEnvInstance.GetEpDevices(); + Assert.NotEmpty(epDevices); + + // select too many devices + sessionOptions.SetEpSelectionPolicyDelegate(SelectionPolicyDelegateTooMany); + + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + + // session should fail + try + { + using var session = new InferenceSession(model, sessionOptions); + Assert.Fail("Should have thrown an exception"); + } + catch (OnnxRuntimeException ex) + { + // Current C++ max is 8. We copy all devices and keep adding until we exceed that. + const int max = 8; + var numSelected = epDevices.Count > max ? epDevices.Count : (max + 1); + var expected = "[ErrorCode:Fail] EP selection delegate failed: The number of selected devices " + + $"({numSelected}) returned by the C# selection delegate exceeds the maximum ({max})"; + Assert.Contains(expected, ex.Message); + } + } + + // throw exception in user provided delegate + private static List SelectionPolicyDelegateThrows(IReadOnlyList epDevices, + OrtKeyValuePairs modelMetadata, + OrtKeyValuePairs runtimeMetadata, + uint maxSelections) + { + throw new ArgumentException("Test exception"); + } + + [Fact] + public void SetEpSelectionPolicyDelegateThrows() + { + using SessionOptions sessionOptions = new SessionOptions(); + sessionOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE; + + var epDevices = ortEnvInstance.GetEpDevices(); + Assert.NotEmpty(epDevices); + + sessionOptions.SetEpSelectionPolicyDelegate(SelectionPolicyDelegateThrows); + + var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx"); + + try + { + using var session = new InferenceSession(model, sessionOptions); + Assert.Fail("Should have thrown an exception"); + } + catch (OnnxRuntimeException ex) + { + var expected = "[ErrorCode:Fail] EP selection delegate failed: " + + "The C# selection delegate threw an exception: Test exception"; + Assert.Contains(expected, ex.Message); + } + } } #endif \ No newline at end of file From ef3caaf34b4ff8677ec3cb010faa61b22abe3b54 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Tue, 6 May 2025 21:24:31 -0700 Subject: [PATCH 29/84] [Clean up] Fix ep_name vs ep_registration_name usage in autoEP Python unit tests (#24667) ### Description Cleans up the usage of `ep_name` and `ep_registration_name` in the autoEP Python unit tests. ### Motivation and Context Addresses comments from a previous PR: https://github.com/microsoft/onnxruntime/pull/24634 > nit: the registration name and EP names don't need to match. 
could we call this 'ep_name' to avoid potentially creating an assumption that they always do? --- .../python/onnxruntime_pybind_state.cc | 2 +- .../python/onnxruntime_test_python_autoep.py | 43 +++++++++---------- .../onnxruntime_test_python_compile_api.py | 6 +-- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index d5f8c7c181960..aa2c0cc6a0f86 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1801,7 +1801,7 @@ void addGlobalMethods(py::module& m) { #if !defined(ORT_MINIMAL_BUILD) /** - * Calls the user's Python EP selection function and coverts the results to a format that can be used + * Calls the user's Python EP selection function and converts the results to a format that can be used * by ORT to select OrtEpDevice instances. The user's function is set by calling * SessionOptions.set_provider_selection_policy_delegate() on the Python side. The result of this wrapper * function is used in core/session/provider_policy_context.cc. diff --git a/onnxruntime/test/python/onnxruntime_test_python_autoep.py b/onnxruntime/test/python/onnxruntime_test_python_autoep.py index 61dc0ff221318..417a6e27fb7b2 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_autoep.py +++ b/onnxruntime/test/python/onnxruntime_test_python_autoep.py @@ -28,24 +28,23 @@ def test_cuda_ep_register_and_inference(self): Test registration of CUDA EP, adding its OrtDevice to the SessionOptions, and running inference. """ ep_lib_path = "onnxruntime_providers_cuda.dll" - ep_registration_name = "CUDAExecutionProvider" + ep_name = "CUDAExecutionProvider" if sys.platform != "win32": self.skipTest("Skipping test because device discovery is only supported on Windows") - if ep_registration_name not in available_providers: + if ep_name not in available_providers: self.skipTest("Skipping test because it needs to run on CUDA EP") - self.register_execution_provider_library(ep_registration_name, ep_lib_path) + self.register_execution_provider_library(ep_name, ep_lib_path) ep_devices = onnxrt.get_ep_devices() has_cpu_ep = False cuda_ep_device = None for ep_device in ep_devices: - ep_name = ep_device.ep_name - if ep_name == "CPUExecutionProvider": + if ep_device.ep_name == "CPUExecutionProvider": has_cpu_ep = True - if ep_name == ep_registration_name: + if ep_device.ep_name == ep_name: cuda_ep_device = ep_device self.assertTrue(has_cpu_ep) @@ -70,22 +69,22 @@ def test_cuda_ep_register_and_inference(self): np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library - self.unregister_execution_provider_library(ep_registration_name) + self.unregister_execution_provider_library(ep_name) def test_cuda_prefer_gpu_and_inference(self): """ Test selecting CUDA EP via the PREFER_GPU policy and running inference. """ ep_lib_path = "onnxruntime_providers_cuda.dll" - ep_registration_name = "CUDAExecutionProvider" + ep_name = "CUDAExecutionProvider" if sys.platform != "win32": self.skipTest("Skipping test because device discovery is only supported on Windows") - if ep_registration_name not in available_providers: + if ep_name not in available_providers: self.skipTest("Skipping test because it needs to run on CUDA EP") - self.register_execution_provider_library(ep_registration_name, ep_lib_path) + self.register_execution_provider_library(ep_name, ep_lib_path) # Set a policy to prefer GPU. 
Cuda should be selected. sess_options = onnxrt.SessionOptions() @@ -102,22 +101,22 @@ def test_cuda_prefer_gpu_and_inference(self): np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library - self.unregister_execution_provider_library(ep_registration_name) + self.unregister_execution_provider_library(ep_name) def test_cuda_ep_selection_delegate_and_inference(self): """ Test selecting CUDA EP via the custom EP selection delegate function and then run inference. """ ep_lib_path = "onnxruntime_providers_cuda.dll" - ep_registration_name = "CUDAExecutionProvider" + ep_name = "CUDAExecutionProvider" if sys.platform != "win32": self.skipTest("Skipping test because device discovery is only supported on Windows") - if ep_registration_name not in available_providers: + if ep_name not in available_providers: self.skipTest("Skipping test because it needs to run on CUDA EP") - self.register_execution_provider_library(ep_registration_name, ep_lib_path) + self.register_execution_provider_library(ep_name, ep_lib_path) # User's custom EP selection function. def my_delegate( @@ -130,7 +129,7 @@ def my_delegate( self.assertGreaterEqual(len(ep_devices), 2) self.assertGreaterEqual(max_selections, 2) - cuda_ep_device = next((d for d in ep_devices if d.ep_name == ep_registration_name), None) + cuda_ep_device = next((d for d in ep_devices if d.ep_name == ep_name), None) self.assertIsNotNone(cuda_ep_device) # Select the CUDA device and the ORT CPU EP device (should always be last) @@ -150,7 +149,7 @@ def my_delegate( np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) del sess # Delete session before unregistering library - self.unregister_execution_provider_library(ep_registration_name) + self.unregister_execution_provider_library(ep_name) def test_custom_ep_selection_delegate_that_raises_error(self): """ @@ -192,18 +191,16 @@ def test_example_plugin_ep_devices(self): except FileNotFoundError: self.skipTest(f"Skipping test because EP library '{ep_lib_path}' cannot be found") - ep_registration_name = "example_ep" - self.register_execution_provider_library(ep_registration_name, os.path.realpath(ep_lib_path)) + ep_name = "example_ep" + self.register_execution_provider_library(ep_name, os.path.realpath(ep_lib_path)) ep_devices = onnxrt.get_ep_devices() has_cpu_ep = False test_ep_device = None for ep_device in ep_devices: - ep_name = ep_device.ep_name - - if ep_name == "CPUExecutionProvider": + if ep_device.ep_name == "CPUExecutionProvider": has_cpu_ep = True - if ep_name == ep_registration_name: + if ep_device.ep_name == ep_name: test_ep_device = ep_device self.assertTrue(has_cpu_ep) @@ -236,7 +233,7 @@ def test_example_plugin_ep_devices(self): sess_options.add_provider_for_devices([test_ep_device], {"opt1": "val1"}) self.assertIn("EP is not currently supported", str(context.exception)) - self.unregister_execution_provider_library(ep_registration_name) + self.unregister_execution_provider_library(ep_name) if __name__ == "__main__": diff --git a/onnxruntime/test/python/onnxruntime_test_python_compile_api.py b/onnxruntime/test/python/onnxruntime_test_python_compile_api.py index 866388f3aa226..7a410d4bbeb6a 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_compile_api.py +++ b/onnxruntime/test/python/onnxruntime_test_python_compile_api.py @@ -34,8 +34,8 @@ def test_compile_with_files_prefer_npu_policy(self): self.skipTest("Skipping test because provider selection policies are only supported on Windows") 
ep_lib_path = "onnxruntime_providers_qnn.dll" - ep_registration_name = "QNNExecutionProvider" - self.register_execution_provider_library(ep_registration_name, ep_lib_path) + ep_name = "QNNExecutionProvider" + self.register_execution_provider_library(ep_name, ep_lib_path) input_model_path = get_name("nhwc_resize_scales_opset18.onnx") output_model_path = os.path.join(self._tmp_dir_path, "model.compiled0.onnx") @@ -51,7 +51,7 @@ def test_compile_with_files_prefer_npu_policy(self): ) model_compiler.compile_to_file(output_model_path) self.assertTrue(os.path.exists(output_model_path)) - self.unregister_execution_provider_library(ep_registration_name) + self.unregister_execution_provider_library(ep_name) def test_compile_with_ep_selection_delegate(self): """ From 76ae65a7cda306e6e6601bcb4d9e53e32c84c333 Mon Sep 17 00:00:00 2001 From: quic-tirupath Date: Tue, 6 May 2025 21:34:51 -0700 Subject: [PATCH 30/84] [QNN EP] Fix Resize Op support translation (#24657) - Use ResizeNearestNeighbor Op for Resize with interpolation_mode=Nearest and rank-4 inputs. - Add a Unit test to verify the modified translation. ### Description ResizeNearestNeighbor Op is faster for Resize with interpolation_mode=Nearest and rank-4 inputs. ### Motivation and Context This commit matches Resize Op behavior in QNN-EP with QNN Offline converter path. This fix also improves inference time. --- .../qnn/builder/opbuilder/resize_op_builder.cc | 17 +++++++---------- onnxruntime/test/providers/qnn/resize_test.cc | 9 +++++++++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc index 347f0651069dc..85844721b1f2c 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc @@ -185,18 +185,15 @@ Status ResizeOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str()); #endif - const bool use_resize_nn_op = nearest_mode == "floor"; + // Use ResizeNearestNeighbor for rank-4 inputs. + const bool use_resize_nn_op = input_rank == 4; // If HTP uses ResizeNearestNeighbor ("floor"), then the "pytorch_half_pixel" coordinate_transformation_mode // is not supported. - ORT_RETURN_IF(use_resize_nn_op && transformation_mode == "pytorch_half_pixel", + ORT_RETURN_IF(!use_resize_nn_op && nearest_mode == "floor" && transformation_mode == "pytorch_half_pixel", "QNN EP: Resize on the NPU does not support the combination of nearest_mode == 'floor' ", " and coordinate_transformation_mode == 'pytorch_half_pixel'."); - // QNN's ResizeNearestNeighbor requires rank 4 inputs. - ORT_RETURN_IF(use_resize_nn_op && input_rank != 4, - "QNN EP: Resize on the NPU with nearest_mode == 'floor' requires an input with rank 4."); - #if QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 14 // QNN's Resize only supports "round_prefer_ceil" if transformation_mode is "align_corners". 
ORT_RETURN_IF(!use_resize_nn_op && transformation_mode != "align_corners", @@ -267,11 +264,11 @@ Status ResizeOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType()); std::string qnn_op_type = "Resize"; - if (is_npu_backend && input_rank == 4 && interp_mode == "nearest" && nearest_mode == "floor") { + if (is_npu_backend && input_rank == 4 && interp_mode == "nearest") { // Translate Resize with - // {input_rank: 4, mode: "nearest", nearest_mode: "floor", coordinate_transformation_mode: XXX} to - // QNN's ResizeNearestNeighbor operator on the HTP backend. This combination of parameters is not supported on HTP - // via QNN's Resize operator. Note that QNN's ResizeNearestNeighbor operator always uses "floor" rounding. + // {input_rank: 4, mode: "nearest", coordinate_transformation_mode: XXX} to + // QNN's ResizeNearestNeighbor operator on the HTP backend. QNN ResizeNearestNeighbor + // seems to be faster than QNN Resize. qnn_op_type = "ResizeNearestNeighbor"; // 'align_corners' diff --git a/onnxruntime/test/providers/qnn/resize_test.cc b/onnxruntime/test/providers/qnn/resize_test.cc index fbd729fa998d9..702d4e6eddb1b 100644 --- a/onnxruntime/test/providers/qnn/resize_test.cc +++ b/onnxruntime/test/providers/qnn/resize_test.cc @@ -399,6 +399,15 @@ TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestHalfPixelRoundPreferFloor_Unsupport ExpectedEPNodeAssignment::None); // No longer supported as of QNN SDK 2.21 } +// Test 2x QDQ Resize mode: "nearest", coordinate_transformation_mode: "half_pixel", nearest_mode: "round_prefer_Ceil" +// Maps to QNN's ResizeNearesetNeighbor operator. +TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestHalfPixelRoundPreferCeil) { + std::vector input_data = GetFloatDataInRange(-10.0f, 10.0f, 48); + RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), + {1, 3, 8, 8}, "nearest", "half_pixel", "round_prefer_ceil", + ExpectedEPNodeAssignment::All); +} + // Test 2x QDQ Resize mode: "nearest", coordinate_transformation_mode: "align_corners", nearest_mode: "round_prefer_ceil" // Maps to QNN's Resize operator. // UPDATE: "round_prefer_ceil" is supported as of QNN SDK 2.21 if using "align_corners". (Unsupported in QNN SDK 2.19). From dc09448cf757c02a6c70b3acf422bab22abf1cc4 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 6 May 2025 21:46:23 -0700 Subject: [PATCH 31/84] Fix improper vector iterator handling and a couple of warnings (#24664) ### Description When erasing elements from a vector while iterating over it, the iterator must be updated to the new position after the erase operation as a return value, otherwise results in decrement of end operator which is undefined behavior. Fix a couple of warnings as well. ### Motivation and Context Causes test assert. 
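As an editor's illustration of the fix (a minimal standalone sketch; the container, element type, and threshold below are stand-ins, not the actual TensorRT EP types):

```cpp
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  // Stand-in for supported_nodes_vector: each entry pairs a "subgraph" (node indices) with metadata.
  std::vector<std::pair<std::vector<size_t>, bool>> subgraphs = {
      {{0, 1}, true}, {{2}, true}, {{3, 4, 5}, true}};
  const size_t min_subgraph_size = 2;

  // Correct idiom: std::vector::erase returns the iterator to the element after the one removed,
  // so assign it back; only increment when nothing was erased.
  for (auto it = subgraphs.begin(); it != subgraphs.end();) {
    if (it->first.size() < min_subgraph_size) {
      it = subgraphs.erase(it);
    } else {
      ++it;
    }
  }

  // The previous form, erase(it--), decrements an iterator that may already be begin(),
  // which is undefined behavior and what triggered the test assert.
  std::cout << "remaining subgraphs: " << subgraphs.size() << "\n";  // prints 2
  return 0;
}
```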
--- onnxruntime/core/framework/session_state_utils.cc | 2 +- .../core/providers/tensorrt/tensorrt_execution_provider.cc | 7 ++++--- .../tensorrt/tensorrt_execution_provider_custom_ops.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index cef902e506075..cacd772b61d76 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -369,7 +369,7 @@ common::Status SaveInitializedTensors( if (memory_profile_func) memory_profile_func(planner); - for (auto i : planned_initializers_memory_sizes_in_byte) { + for (const auto& i : planned_initializers_memory_sizes_in_byte) { LOGS(logger, INFO) << "[Memory] SessionStateInitializer statically allocates " << i.second << " bytes for " << i.first.ToString() << std::endl; } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index ded135bf50ec8..72eb2579e9d42 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -2587,11 +2587,12 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, supported_nodes_vector.clear(); } - // Remove subgraphs if its size is less than the predefined minimal size - for (auto it = supported_nodes_vector.begin(); it != supported_nodes_vector.end(); ++it) { + for (auto it = supported_nodes_vector.begin(); it != supported_nodes_vector.end();) { const size_t subgraph_size = it->first.size(); if (subgraph_size < min_subgraph_size_) { - supported_nodes_vector.erase(it--); + it = supported_nodes_vector.erase(it); + } else { + ++it; } } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h index a72de6ed75399..8d4dc19690eac 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h @@ -78,7 +78,7 @@ struct TensorRTCustomOp : Ort::CustomOpBase Date: Wed, 7 May 2025 00:31:50 -0700 Subject: [PATCH 32/84] Use GUID_DEVCLASS_COMPUTEACCELERATOR for NPU discovery (#24660) ### Description Updated device discovery to use `GUID_DEVCLASS_COMPUTEACCELERATOR` instead of `GUID_DEVCLASS_SYSTEM` when querying SetupAPI for potential NPU devices. This provides a more specific and accurate class for identifying compute accelerators like NPUs. This change also saves us an average of 5 milliseconds by not looping through unnecessary system devices. ### Motivation and Context When looking for NPUs, the previous code used `GUID_DEVCLASS_SYSTEM` as the class to query for potential devices and didn't return the Qualcomm NPU. 
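For context, a minimal sketch of how a SetupAPI class GUID scopes the enumeration (error handling trimmed; the GUID value below is the compute-accelerator class GUID from this change, everything else is illustrative rather than the EP's actual code):

```cpp
#include <windows.h>
#include <setupapi.h>
#include <iostream>

#pragma comment(lib, "setupapi.lib")

int main() {
  // GUID_DEVCLASS_COMPUTEACCELERATOR ({f01a9d53-3ff6-48d2-9f97-c8a7004be10c}), declared locally as in the patch.
  const GUID compute_accelerator_class = {
      0xf01a9d53, 0x3ff6, 0x48d2, {0x9f, 0x97, 0xc8, 0xa7, 0x00, 0x4b, 0xe1, 0x0c}};

  // Only devices in this setup class are returned, so the loop never visits the
  // unrelated entries that GUID_DEVCLASS_SYSTEM used to pull in.
  HDEVINFO dev_info = SetupDiGetClassDevsW(&compute_accelerator_class, nullptr, nullptr, DIGCF_PRESENT);
  if (dev_info == INVALID_HANDLE_VALUE) {
    return 1;
  }

  SP_DEVINFO_DATA dev_data = {};
  dev_data.cbSize = sizeof(dev_data);
  for (DWORD i = 0; SetupDiEnumDeviceInfo(dev_info, i, &dev_data); ++i) {
    wchar_t hardware_id[1024] = {};
    if (SetupDiGetDeviceRegistryPropertyW(dev_info, &dev_data, SPDRP_HARDWAREID, nullptr,
                                          reinterpret_cast<PBYTE>(hardware_id), sizeof(hardware_id), nullptr)) {
      // Typically a string such as L"ACPI\\VEN_xxxx&DEV_yyyy&..." that is then parsed for VEN_/DEV_.
      std::wcout << L"candidate NPU: " << hardware_id << L"\n";
    }
  }

  SetupDiDestroyDeviceInfoList(dev_info);
  return 0;
}
```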
--- onnxruntime/core/platform/windows/device_discovery.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/platform/windows/device_discovery.cc b/onnxruntime/core/platform/windows/device_discovery.cc index 5a5b5041a5912..1f8600d6ca4a6 100644 --- a/onnxruntime/core/platform/windows/device_discovery.cc +++ b/onnxruntime/core/platform/windows/device_discovery.cc @@ -75,11 +75,12 @@ std::unordered_map GetDeviceInfoSetupApi(const std::unorde const GUID local_DXCORE_ADAPTER_ATTRIBUTE_D3D12_GENERIC_ML = {0xb71b0d41, 0x1088, 0x422f, 0xa2, 0x7c, 0x2, 0x50, 0xb7, 0xd3, 0xa9, 0x88}; const GUID local_DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU = {0xd46140c4, 0xadd7, 0x451b, 0x9e, 0x56, 0x6, 0xfe, 0x8c, 0x3b, 0x58, 0xed}; + const GUID local_GUID_DEVCLASS_COMPUTEACCELERATOR = {0xf01a9d53, 0x3ff6, 0x48d2, 0x9f, 0x97, 0xc8, 0xa7, 0x00, 0x4b, 0xe1, 0x0c}; std::array guids = { GUID_DEVCLASS_DISPLAY, GUID_DEVCLASS_PROCESSOR, - GUID_DEVCLASS_SYSTEM, + local_GUID_DEVCLASS_COMPUTEACCELERATOR, }; for (auto guid : guids) { @@ -183,9 +184,9 @@ std::unordered_map GetDeviceInfoSetupApi(const std::unorde entry->type = OrtHardwareDeviceType_GPU; } else if (guid == GUID_DEVCLASS_PROCESSOR) { entry->type = is_npu ? OrtHardwareDeviceType_NPU : OrtHardwareDeviceType_CPU; - } else if (guid == GUID_DEVCLASS_SYSTEM) { + } else if (guid == local_GUID_DEVCLASS_COMPUTEACCELERATOR) { if (!is_npu) { - // we're only iterating system devices to look for NPUs so drop anything else + // we're only iterating compute accelerator devices to look for NPUs so drop anything else device_info.erase(key); continue; } From 143d8be6df2a36cb56f3b3189fb6f2f1c4e83e0d Mon Sep 17 00:00:00 2001 From: Yuduo Wu <6426433+1duo@users.noreply.github.com> Date: Wed, 7 May 2025 10:00:21 -0700 Subject: [PATCH 33/84] [QNN-EP] Einsum QDQ tests followup: node selector (#24659) [QNN-EP] Einsum QDQ tests followup: node selector. 
Verified now Einsum QDQ tests are running with `QNN_DATATYPE_UFIXED_POINT_8`: ``` 2025-05-06 11:22:09.738247549 [V:onnxruntime:qdq_model_logger, qnn_model_wrapper.cc:305 ComposeQnnGraph] Qnn_OpConfig node name: node_token_15 package_name: qti.aisw QNN_op_type: MatMul num_of_inputs: 2 num_of_outputs: 1 num_of_params: 2 node_inputs: name=node id=2 version=1 type=QNN_TENSOR_TYPE_NATIVE dataFormat=0 dataType=QNN_DATATYPE_UFIXED_POINT_8 rank=2 dimensions=(2 3 ) memType=QNN_TENSORMEMTYPE_RAW quantizeParams: encodingDefinition=QNN_DEFINITION_DEFINED quantizationEncoding=QNN_QUANTIZATION_ENCODING_SCALE_OFFSET scale=0.000980392 offset=-102 name=node_token_6 id=4 version=1 type=QNN_TENSOR_TYPE_NATIVE dataFormat=0 dataType=QNN_DATATYPE_UFIXED_POINT_8 rank=2 dimensions=(3 4 ) memType=QNN_TENSORMEMTYPE_RAW quantizeParams: encodingDefinition=QNN_DEFINITION_DEFINED quantizationEncoding=QNN_QUANTIZATION_ENCODING_SCALE_OFFSET scale=0.00215686 offset=-46 node_outputs: name=node_token_16 id=5 version=1 type=QNN_TENSOR_TYPE_NATIVE dataFormat=0 dataType=QNN_DATATYPE_UFIXED_POINT_8 rank=2 dimensions=(2 4 ) memType=QNN_TENSORMEMTYPE_RAW quantizeParams: encodingDefinition=QNN_DEFINITION_DEFINED quantizationEncoding=QNN_QUANTIZATION_ENCODING_SCALE_OFFSET scale=0.000441176 offset=-40 node_params: type=QNN_PARAMTYPE_SCALAR name=transpose_in0 value=0 type=QNN_PARAMTYPE_SCALAR name=transpose_in1 value=0 ``` --- .../selectors_actions/qdq_selectors.cc | 31 +++++++++++++++++++ .../selectors_actions/qdq_selectors.h | 16 ++++++++++ .../selectors_actions/shared/utils.cc | 11 +++++++ 3 files changed, 58 insertions(+) diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 05627dd25857f..6515661a2ee6a 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -401,6 +401,37 @@ void ConvSelector::UpdateBuilder(NodesToOptimizeIndicesBuilder& builder) const { builder.input_nodes.resize(3, NodesToOptimizeIndices::kEmptyNodeIndex); } +bool EinsumNodeGroupSelector::Check(const GraphViewer& graph_viewer, + const Node& node, const Node* redundant_clip_node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const { + if (!CheckQDQNodes(graph_viewer, node, redundant_clip_node, dq_nodes, q_nodes, /*num_dq_inputs=*/-1, + /*is_empty_q_nodes_allowed=*/true)) { + return false; + } + size_t num_dq_inputs = dq_nodes.size(); + for (size_t i = 0; i < num_dq_inputs; ++i) { + int32_t dt_input = dq_nodes[i]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + if (!allow_int8_ && dt_input == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8) { + return false; + } + if (!allow_16bit_ && Is16BitIntType(dt_input)) { + return false; + } + if (!allow_4bit_ && Is4BitIntType(dt_input)) { + return false; + } + } + if (!q_nodes.empty()) { + int32_t dt_input0 = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + if (dt_input0 != dt_output) { + return false; + } + } + return true; +} + bool MatMulNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const Node* redundant_clip_node, const std::vector& dq_nodes, const std::vector& q_nodes) const { diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h 
b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 36e04146040db..e4f4844fb88ad 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -182,6 +182,22 @@ class PadNodeGroupSelector : public NodeGroupSelector { const std::vector& q_nodes) const override; }; +// one ore more DQ nodes for each input -> node -> Q +class EinsumNodeGroupSelector : public NodeGroupSelector { + public: + explicit EinsumNodeGroupSelector(bool allow_int8 = true, bool allow_16bit = true, bool allow_4bit = true) + : allow_int8_(allow_int8), allow_16bit_(allow_16bit), allow_4bit_(allow_4bit) {} + + private: + bool Check(const GraphViewer& graph_viewer, + const Node& node, const Node* redundant_clip_node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const override; + bool allow_int8_; + bool allow_16bit_; + bool allow_4bit_; +}; + // 2 DQ nodes for input -> node -> optional Q if QLinearMatMul, MatMulIntegerToFloat if not // The lack of a trailing Q isn't really a QDQ node group, so we default support for that to off. class MatMulNodeGroupSelector : public NodeGroupSelector { diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc index e531d19d4c643..d3957a34dcfca 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc @@ -113,6 +113,9 @@ static const OpVersionsAndSelector::OpVersionsMap GetConvOpVersionsMap() { static const OpVersionsAndSelector::OpVersionsMap GetConvTransposeOpVersionsMap() { return {{"ConvTranspose", {}}}; } +static const OpVersionsAndSelector::OpVersionsMap GetEinsumOpVersionsMap() { + return {{"Einsum", {}}}; +} static const OpVersionsAndSelector::OpVersionsMap GetMatMulOpVersionsMap() { return {{"MatMul", {}}}; } @@ -202,6 +205,13 @@ void RegisterConvTransposeSelector(Selectors& qdq_selectors) { std::move(selector)); } +void RegisterEinsumSelector(Selectors& qdq_selectors) { + /* register selector for einsum op */ + std::unique_ptr selector = std::make_unique(); + qdq_selectors.RegisterSelector(GetEinsumOpVersionsMap(), + std::move(selector)); +} + void RegisterMatMulSelector(Selectors& qdq_selectors) { /* register selector for matmul op */ std::unique_ptr selector = std::make_unique(); @@ -267,6 +277,7 @@ void SelectorManager::CreateSelectors() { RegisterSplitSelector(qdq_selectors_); RegisterConvSelector(qdq_selectors_); RegisterConvTransposeSelector(qdq_selectors_); + RegisterEinsumSelector(qdq_selectors_); RegisterMatMulSelector(qdq_selectors_); RegisterGemmSelector(qdq_selectors_); RegisterInstanceAndLayerNormalizationSelector(qdq_selectors_); From 523f486a5b56d4b1139a8cbcb35ced78e3794096 Mon Sep 17 00:00:00 2001 From: Ashrit Shetty Date: Wed, 7 May 2025 17:11:12 -0700 Subject: [PATCH 34/84] Align hardware ID parsing with DXCore representation (#24682) ### Description Modified the `get_id` lambda to parse 4-character hardware ID components (e.g., from VEN_xxxx or DEV_yyyy) by converting the ASCII string representation directly to a uint32_t using `WStringToUint32Id`. This replaces the previous hexadecimal string-to-integer conversion and aligns with how DXCore reports these vendor and device IDs, ensuring consistent ID interpretation. 
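To make the byte order concrete, a small standalone sketch (the `WStringToUint32Id` body mirrors the helper added in this patch; the "QCOM" value is only an illustrative VEN_ component):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

// Same little-endian packing as the patch: the first character lands in the least significant byte.
uint32_t WStringToUint32Id(const std::wstring& id) {
  uint32_t value = 0;
  for (size_t i = 0; i < 4 && i < id.size(); ++i) {
    value |= static_cast<uint32_t>(id[i] & 0xFF) << (i * 8);
  }
  return value;
}

int main() {
  // "QCOM": 'Q'=0x51 -> byte 0, 'C'=0x43 -> byte 1, 'O'=0x4F -> byte 2, 'M'=0x4D -> byte 3.
  std::wcout << std::hex << WStringToUint32Id(L"QCOM") << L"\n";  // prints 4d4f4351
  // The old hex parse would have rejected "QCOM" (not all hex digits) and returned 0,
  // so it could never match the integer ID reported by DXCore.
  return 0;
}
```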
### Motivation and Context Before this change, we were assuming the hardware ID components were hexadecimal strings and converting them to integers. This assumption is incorrect and was leading to incorrect interpretations of the vendor and device IDs. Down the line when we compare the vendor and device id with the ids we receive from DXCORE, there are no matches, and we fail to copy the information collected in `GetDeviceInfoSetupApi`. ### Testing Tested with all the sample apps and stepped through the code to verify the Vendor and Device IDs match the IDs from DXCORE on both Qualcomm and AMD. --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../core/platform/windows/device_discovery.cc | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/platform/windows/device_discovery.cc b/onnxruntime/core/platform/windows/device_discovery.cc index 1f8600d6ca4a6..5c4c41a5799c8 100644 --- a/onnxruntime/core/platform/windows/device_discovery.cc +++ b/onnxruntime/core/platform/windows/device_discovery.cc @@ -68,6 +68,20 @@ uint64_t GetLuidKey(LUID luid) { return (uint64_t(luid.HighPart) << 32) | luid.LowPart; } +// Converts a wide string (up to 4 characters) representing a hardware ID component (e.g., "ABCD" from "VEN_ABCD") +// into a uint32_t. The conversion is done in a little-endian manner, meaning the first character +// of the string becomes the least significant byte of the integer, and the fourth character +// becomes the most significant byte. +uint32_t WStringToUint32Id(const std::wstring& vendor_name) { + uint32_t vendor_id = 0; + for (size_t i = 0; i < 4 && i < vendor_name.size(); ++i) { + // For little-endian, place each character at the appropriate byte position + // First character goes into lowest byte, last character into highest byte + vendor_id |= static_cast(vendor_name[i] & 0xFF) << (i * 8); + } + return vendor_id; +} + // returns info for display and processor entries. key is (vendor_id << 32 | device_id) // npus: (vendor_id << 32 | device_id) for devices we think are NPUs from DXCORE std::unordered_map GetDeviceInfoSetupApi(const std::unordered_set& npus) { @@ -104,28 +118,33 @@ std::unordered_map GetDeviceInfoSetupApi(const std::unorde //// Get hardware ID (contains VEN_xxxx&DEV_xxxx) if (SetupDiGetDeviceRegistryPropertyW(devInfo, &devData, SPDRP_HARDWAREID, ®DataType, (PBYTE)buffer, sizeof(buffer), &size)) { + uint32_t vendor_id = 0; + uint32_t device_id = 0; + // PCI\VEN_xxxx&DEV_yyyy&... // ACPI\VEN_xxxx&DEV_yyyy&... if we're lucky. // ACPI values seem to be very inconsistent, so we check fairly carefully and always require a device id. const auto get_id = [](const std::wstring& hardware_id, const std::wstring& prefix) -> uint32_t { if (auto idx = hardware_id.find(prefix); idx != std::wstring::npos) { auto id = hardware_id.substr(idx + prefix.size(), 4); - if (std::all_of(id.begin(), id.end(), iswxdigit)) { - return std::stoul(id, nullptr, 16); + if (id.size() == 4) { + // DXCore reports vendor and device IDs as 32-bit integer representations of the ASCII string. + return WStringToUint32Id(id); } } return 0; }; - uint32_t vendor_id = get_id(buffer, L"VEN_"); - uint32_t device_id = get_id(buffer, L"DEV_"); - // Processor ID should come from CPUID mapping. 
- if (vendor_id == 0 && guid == GUID_DEVCLASS_PROCESSOR) { + if (guid == GUID_DEVCLASS_PROCESSOR) { vendor_id = CPUIDInfo::GetCPUIDInfo().GetCPUVendorId(); + } else { + vendor_id = get_id(buffer, L"VEN_"); } + device_id = get_id(buffer, L"DEV_"); + // Won't always have a vendor id from an ACPI entry. ACPI is not defined for this purpose. if (vendor_id == 0 && device_id == 0) { continue; From b7e7b6ff9a3df4a002fffb1305e24ff7a38bd858 Mon Sep 17 00:00:00 2001 From: "genmingz@AMD" Date: Thu, 8 May 2025 08:22:32 +0800 Subject: [PATCH 35/84] [VitisAI] fix graph_save dump unsorted Model (#24678) ### Description When calling graph_save, a model with unsorted nodes will be saved, which will cause errors when using the model later. ### Motivation and Context The resolve function is called in the graph_save process, which causes the nodes to be sorted. Co-authored-by: genmingz --- onnxruntime/core/providers/vitisai/imp/graph.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index e9db58143577b..88bdbaed40c73 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -178,6 +178,8 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri *model_proto->mutable_graph() = *graph_proto_subgraph; auto& logger = logging::LoggingManager::DefaultLogger(); auto model = Model::Create(std::move(*model_proto), ToPathString(filename), nullptr, logger); + auto status = model->MainGraph().Resolve(); + vai_assert(status.IsOK(), "graph resolve error:" + status.ErrorMessage()); if (initializer_size_threshold == std::numeric_limits::max()) { model_proto = model->ToProto(); } else { From 0e0002b032627aa9a30fba1cc6b2dc2d15723a3f Mon Sep 17 00:00:00 2001 From: mingyue <131847423+mingyueliuh@users.noreply.github.com> Date: Thu, 8 May 2025 08:50:48 +0800 Subject: [PATCH 36/84] [Fix] compare OrtDevice error (#24677) ### Description Fix compare OrtDevice when Debug mode Related #24371 ### Motivation and Context add compare device alignment in OrtDevice compare function --- include/onnxruntime/core/framework/ortdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/onnxruntime/core/framework/ortdevice.h b/include/onnxruntime/core/framework/ortdevice.h index 472575d1998f5..2a377238e0e27 100644 --- a/include/onnxruntime/core/framework/ortdevice.h +++ b/include/onnxruntime/core/framework/ortdevice.h @@ -103,7 +103,7 @@ struct OrtDevice { }; inline bool operator==(const OrtDevice& left, const OrtDevice& other) { - return left.Id() == other.Id() && left.MemType() == other.MemType() && left.Type() == other.Type(); + return left.Id() == other.Id() && left.MemType() == other.MemType() && left.Type() == other.Type() && left.GetAlignment() == other.GetAlignment(); } inline bool operator!=(const OrtDevice& left, const OrtDevice& other) { From 0aaccafd41eca1580ec409d4ccd32cd1288c7e05 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com> Date: Thu, 8 May 2025 02:37:36 -0400 Subject: [PATCH 37/84] [MIGraphX EP] [ROCm EP] Update CI to use ROCm 6.4 (#23535) ### Description Update Onnxruntime CI to use latest ROCm release ROCm 6.3.2 ### Motivation and Context Ensure CI is testing up to date with recent Onnxruntime. 
AMD validates changes based off the latest ROCm release and adds additional features on top to validate changes prior to being pushed to our internal testing branches and then up streamed to Microsoft/Onnxruntime:main . Right now Onnxruntime is testing things a few releases back for their CI --------- Co-authored-by: Ted Themistokleous --- .../github/azure-pipelines/linux-migraphx-ci-pipeline.yml | 2 +- .../ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml | 2 +- .../github/linux/docker/migraphx-ci-pipeline-env.Dockerfile | 2 +- .../github/linux/docker/rocm-ci-pipeline-env.Dockerfile | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index de5df97d37d3d..c6ebb80f98e12 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -37,7 +37,7 @@ variables: - name: render value: 109 - name: RocmVersion - value: 6.2.3 + value: 6.4 jobs: - job: Linux_Build diff --git a/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml index af5a8d1decb6e..7388ed6d5a1e9 100644 --- a/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-rocm-ci-pipeline.yml @@ -37,7 +37,7 @@ variables: - name: render value: 109 - name: RocmVersion - value: 6.3.2 + value: 6.4 jobs: - job: Linux_Build diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 667ffe03d8922..7b02a5e658d31 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -1,7 +1,7 @@ # Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete FROM ubuntu:22.04 -ARG ROCM_VERSION=6.2.3 +ARG ROCM_VERSION=6.4 ARG AMDGPU_VERSION=${ROCM_VERSION} ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' diff --git a/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile index 1cd5f289dd1c9..83a4e04435b95 100644 --- a/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/rocm-ci-pipeline-env.Dockerfile @@ -1,7 +1,7 @@ # Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete FROM ubuntu:22.04 -ARG ROCM_VERSION=6.3.2 +ARG ROCM_VERSION=6.4 ARG AMDGPU_VERSION=${ROCM_VERSION} ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' From 05793bd2ce1a7f348b32660a3553a3439aecf0d0 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 8 May 2025 10:51:43 -0700 Subject: [PATCH 38/84] Qnn nuget package update for arm64x (#24690) ### Description Update the folder name from win-arm64x to win-arm64 since it is invalid RID: https://learn.microsoft.com/en-us/dotnet/core/rid-catalog#windows-rids --- .../targets/netstandard/props_qnn.xml | 20 +++++++++---------- .../azure-pipelines/templates/qnn-ep-win.yml | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props_qnn.xml 
b/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props_qnn.xml index fa0e957418fab..83ffb22ccf6b2 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props_qnn.xml +++ b/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props_qnn.xml @@ -12,7 +12,7 @@ - $(MSBuildThisFileDirectory)../../runtimes/win-arm64x/native/onnxruntime.lib;%(AdditionalDependencies) + $(MSBuildThisFileDirectory)../../runtimes/win-arm64/native/onnxruntime.lib;%(AdditionalDependencies) @@ -24,7 +24,7 @@ - $(MSBuildThisFileDirectory)../../runtimes/win-arm64x/native/onnxruntime.lib;%(AdditionalDependencies) + $(MSBuildThisFileDirectory)../../runtimes/win-arm64/native/onnxruntime.lib;%(AdditionalDependencies) @@ -36,7 +36,7 @@ x86 - arm64x + arm64 arm $(Platform) @@ -47,31 +47,31 @@ - + Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-arm64\native\onnxruntime.dll')"> onnxruntime.dll PreserveNewest false - + Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-arm64\native\onnxruntime_providers_shared.dll')"> onnxruntime_providers_shared.dll PreserveNewest false - onnxruntime.dll PreserveNewest false - + Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-arm64\native\onnxruntime_providers_shared.dll')"> onnxruntime_providers_shared.dll PreserveNewest false diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 6bfc00b5b46eb..d739724f8744a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -98,7 +98,7 @@ stages: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj' platform: 'Any CPU' configuration: ${{ parameters.build_config }} - msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:TargetArchitecture=arm64x' + msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:TargetArchitecture=arm64' workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: CopyFiles@2 From 3dc91e6c31e008983dbc36ef63f0fbb253741dc7 Mon Sep 17 00:00:00 2001 From: Ishwar Raut Date: Thu, 8 May 2025 17:29:32 -0700 Subject: [PATCH 39/84] Removed the dependencies from cuda ep dll (#24656) ### Remove the dependencies from CUDA EP DLL- 1. Copied the CUDA allocator from CUDA EP to NV RTX EP 2. Copied data transfer from CUDA EP to NV RTX EP 3. 
Implemented the CUDA error handling in NV RTX EP ### Motivation and Context @ankan-ban @gedoensmax @chilo-ms to review --------- Co-authored-by: iraut --- .../providers/nv_tensorrt_rtx/nv_allocator.cc | 100 ++++++++++++ .../providers/nv_tensorrt_rtx/nv_allocator.h | 65 ++++++++ .../providers/nv_tensorrt_rtx/nv_cuda_call.cc | 145 ++++++++++++++++++ .../nv_tensorrt_rtx/nv_data_transfer.cc | 92 +++++++++++ .../nv_tensorrt_rtx/nv_data_transfer.h | 25 +++ .../nv_tensorrt_rtx/nv_execution_provider.cc | 21 +-- .../provider_bridge_provider.cc | 2 +- tools/ci_build/build.py | 2 +- 8 files changed, 436 insertions(+), 16 deletions(-) create mode 100644 onnxruntime/core/providers/nv_tensorrt_rtx/nv_allocator.cc create mode 100644 onnxruntime/core/providers/nv_tensorrt_rtx/nv_allocator.h create mode 100644 onnxruntime/core/providers/nv_tensorrt_rtx/nv_cuda_call.cc create mode 100644 onnxruntime/core/providers/nv_tensorrt_rtx/nv_data_transfer.cc create mode 100644 onnxruntime/core/providers/nv_tensorrt_rtx/nv_data_transfer.h diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_allocator.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_allocator.cc new file mode 100644 index 0000000000000..4e8179d86fd73 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_allocator.cc @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "nv_allocator.h" +#include "nv_includes.h" +#include "core/providers/cuda/shared_inc/cuda_call.h" +namespace onnxruntime { + +void CUDAAllocator::CheckDevice(bool throw_when_fail) const { +#ifndef NDEBUG + // check device to match at debug build + // if it's expected to change, call cudaSetDevice instead of the check + int current_device; + auto cuda_err = cudaGetDevice(¤t_device); + if (cuda_err == cudaSuccess) { + ORT_ENFORCE(current_device == Info().id); + } else if (throw_when_fail) { + CUDA_CALL_THROW(cuda_err); + } +#else + ORT_UNUSED_PARAMETER(throw_when_fail); +#endif +} + +void CUDAAllocator::SetDevice(bool throw_when_fail) const { + int current_device; + auto cuda_err = cudaGetDevice(¤t_device); + if (cuda_err == cudaSuccess) { + int allocator_device_id = Info().id; + if (current_device != allocator_device_id) { + cuda_err = cudaSetDevice(allocator_device_id); + } + } + + if (cuda_err != cudaSuccess && throw_when_fail) { + CUDA_CALL_THROW(cuda_err); + } +} + +void* CUDAAllocator::Alloc(size_t size) { + SetDevice(true); + CheckDevice(true); + void* p = nullptr; + if (size > 0) { + // BFCArena was updated recently to handle the exception and adjust the request size + CUDA_CALL_THROW(cudaMalloc((void**)&p, size)); + } + return p; +} + +void CUDAAllocator::Free(void* p) { + SetDevice(false); + CheckDevice(false); // ignore CUDA failure when free + cudaFree(p); // do not throw error since it's OK for cudaFree to fail during shutdown +} + +void* CUDAExternalAllocator::Alloc(size_t size) { + void* p = nullptr; + if (size > 0) { + p = alloc_(size); + + // review(codemzs): ORT_ENFORCE does not seem appropriate. 
+ ORT_ENFORCE(p != nullptr); + } + + return p; +} + +void CUDAExternalAllocator::Free(void* p) { + free_(p); + std::lock_guard lock(lock_); + auto it = reserved_.find(p); + if (it != reserved_.end()) { + reserved_.erase(it); + if (empty_cache_) empty_cache_(); + } +} + +void* CUDAExternalAllocator::Reserve(size_t size) { + void* p = Alloc(size); + if (!p) return nullptr; + std::lock_guard lock(lock_); + ORT_ENFORCE(reserved_.find(p) == reserved_.end()); + reserved_.insert(p); + return p; +} + +void* CUDAPinnedAllocator::Alloc(size_t size) { + void* p = nullptr; + if (size > 0) { + CUDA_CALL_THROW(cudaMallocHost((void**)&p, size)); + } + return p; +} + +void CUDAPinnedAllocator::Free(void* p) { + CUDA_CALL_THROW(cudaFreeHost(p)); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_allocator.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_allocator.h new file mode 100644 index 0000000000000..a3f05bded5de9 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_allocator.h @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/inlined_containers.h" +#include "core/framework/allocator.h" +#include + +namespace onnxruntime { + +class CUDAAllocator : public IAllocator { + public: + CUDAAllocator(OrtDevice::DeviceId device_id, const char* name) + : IAllocator( + OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id), + device_id, OrtMemTypeDefault)) {} + void* Alloc(size_t size) override; + void Free(void* p) override; + + private: + void CheckDevice(bool throw_when_fail) const; + void SetDevice(bool throw_when_fail) const; +}; + +class CUDAExternalAllocator : public CUDAAllocator { + typedef void* (*ExternalAlloc)(size_t size); + typedef void (*ExternalFree)(void* p); + typedef void (*ExternalEmptyCache)(); + + public: + CUDAExternalAllocator(OrtDevice::DeviceId device_id, const char* name, void* alloc, void* free, void* empty_cache) + : CUDAAllocator(device_id, name) { + alloc_ = reinterpret_cast(alloc); + free_ = reinterpret_cast(free); + empty_cache_ = reinterpret_cast(empty_cache); + } + + void* Alloc(size_t size) override; + void Free(void* p) override; + void* Reserve(size_t size) override; + + private: + mutable std::mutex lock_; + ExternalAlloc alloc_; + ExternalFree free_; + ExternalEmptyCache empty_cache_; + InlinedHashSet reserved_; +}; + +// TODO: add a default constructor +class CUDAPinnedAllocator : public IAllocator { + public: + CUDAPinnedAllocator(const char* name) + : IAllocator( + OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, + OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, 0 /*CPU device always with id 0*/), + 0, OrtMemTypeCPUOutput)) {} + + void* Alloc(size_t size) override; + void Free(void* p) override; +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_cuda_call.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_cuda_call.cc new file mode 100644 index 0000000000000..8e9ea1257cdd2 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_cuda_call.cc @@ -0,0 +1,145 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/shared_library/provider_api.h" +#include "core/providers/cuda/shared_inc/cuda_call.h" +#include + +#ifdef _WIN32 +#else // POSIX +#include +#include +#endif + +namespace onnxruntime { + +using namespace common; + +template +const char* CudaErrString(ERRTYPE) { + ORT_NOT_IMPLEMENTED(); +} + +#define CASE_ENUM_TO_STR(x) \ + case x: \ + return #x + +template <> +const char* CudaErrString(cudaError_t x) { + cudaDeviceSynchronize(); + return cudaGetErrorString(x); +} + +#ifndef USE_CUDA_MINIMAL +template <> +const char* CudaErrString(cublasStatus_t e) { + cudaDeviceSynchronize(); + switch (e) { + CASE_ENUM_TO_STR(CUBLAS_STATUS_SUCCESS); + CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + CASE_ENUM_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + CASE_ENUM_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + CASE_ENUM_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + CASE_ENUM_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + CASE_ENUM_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + CASE_ENUM_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + CASE_ENUM_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + CASE_ENUM_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: + return "(look for CUBLAS_STATUS_xxx in cublas_api.h)"; + } +} + +template <> +const char* CudaErrString(curandStatus) { + cudaDeviceSynchronize(); + return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)"; +} + +template <> +const char* CudaErrString(cudnnStatus_t e) { + cudaDeviceSynchronize(); + return cudnnGetErrorString(e); +} + +template <> +const char* CudaErrString(cufftResult e) { + cudaDeviceSynchronize(); + switch (e) { + CASE_ENUM_TO_STR(CUFFT_SUCCESS); + CASE_ENUM_TO_STR(CUFFT_ALLOC_FAILED); + CASE_ENUM_TO_STR(CUFFT_INVALID_VALUE); + CASE_ENUM_TO_STR(CUFFT_INTERNAL_ERROR); + CASE_ENUM_TO_STR(CUFFT_SETUP_FAILED); + CASE_ENUM_TO_STR(CUFFT_INVALID_SIZE); + default: + return "Unknown cufft error status"; + } +} +#endif + +#ifdef ORT_USE_NCCL +template <> +const char* CudaErrString(ncclResult_t e) { + cudaDeviceSynchronize(); + return ncclGetErrorString(e); +} +#endif + +template +int GetErrorCode(ERRTYPE err) { + return static_cast(err); +} + +template +std::conditional_t CudaCall( + ERRTYPE retCode, const char* exprString, const char* libName, SUCCTYPE successCode, const char* msg, + const char* file, const int line) { + if (retCode != successCode) { + try { +#ifdef _WIN32 + std::string hostname_str = GetEnvironmentVar("COMPUTERNAME"); + if (hostname_str.empty()) { + hostname_str = "?"; + } + const char* hostname = hostname_str.c_str(); +#else + char hostname[HOST_NAME_MAX]; + if (gethostname(hostname, HOST_NAME_MAX) != 0) + strcpy(hostname, "?"); +#endif + int currentCudaDevice = -1; + cudaGetDevice(¤tCudaDevice); + cudaGetLastError(); // clear last CUDA error + static char str[1024]; + snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; file=%s ; line=%d ; expr=%s; %s", + libName, GetErrorCode(retCode), CudaErrString(retCode), currentCudaDevice, + hostname, + file, line, exprString, msg); + if constexpr (THRW) { + // throw an exception with the error info + ORT_THROW(str); + } else { + LOGS_DEFAULT(ERROR) << str; + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, str); + } + } catch (const std::exception& e) { // catch, log, and rethrow since CUDA code sometimes hangs in destruction, + // so we'd never get to see the error + if constexpr (THRW) { + ORT_THROW(e.what()); + } else { + LOGS_DEFAULT(ERROR) << e.what(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, e.what()); + } + } + } + if constexpr (!THRW) { + return Status::OK(); + } +} + +template Status 
CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line); +template void CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line); + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_data_transfer.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_data_transfer.cc new file mode 100644 index 0000000000000..4779ddd1a9556 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_data_transfer.cc @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/shared_library/provider_api.h" + +#include "nv_data_transfer.h" + +#include "core/providers/cuda/shared_inc/cuda_call.h" +#define CUDA_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUDA_CALL(expr)) +namespace onnxruntime { +bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { + return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED || + dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED; +} + +common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { + size_t bytes = src.SizeInBytes(); + const void* src_data = src.DataRaw(); + void* dst_data = dst.MutableDataRaw(); + + auto& src_device = src.Location().device; + auto& dst_device = dst.Location().device; + + // for the sync version of memcpy, launch to cuda default stream + if (dst_device.Type() == OrtDevice::GPU) { + if (src_device.Type() == OrtDevice::GPU) { + // Copy only if the two addresses are different. + if (dst_data != src_data) { + CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice)); + // For device memory to device memory copy, no host-side synchronization is performed by cudaMemcpy. + // see https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr)); + } + } else { + // copy from other CPU memory to GPU, this is blocking + CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice)); + if (src_device.MemType() != OrtDevice::MemType::CUDA_PINNED) { + // For cudaMemcpy from pageable host memory to device memory, DMA to final destination may not have completed. 
+ // see https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr)); + } + } + } else if (src_device.Type() == OrtDevice::GPU) { + // copying from GPU to CPU memory, this is blocking + CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost)); + } else { + // copying between cpu memory + ORT_ENFORCE(dst_data != src_data); + memcpy(dst_data, src_data, bytes); + } + + return Status::OK(); +} + +common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst, Stream& stream) const { + size_t bytes = src.SizeInBytes(); + const void* src_data = src.DataRaw(); + void* dst_data = dst.MutableDataRaw(); + + auto& src_device = src.Location().device; + auto& dst_device = dst.Location().device; + + if (dst_device.Type() == OrtDevice::GPU) { + if (src_device.Type() == OrtDevice::CPU) { + // copy from pinned or non-pinned CPU memory to GPU + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, static_cast(stream.GetHandle()))); + } else if (src_device.Type() == OrtDevice::GPU) { + // copying between GPU, this is non-blocking + if (dst_data != src_data) { + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, static_cast(stream.GetHandle()))); + } + } + } else if (src_device.Type() == OrtDevice::GPU) { + if (dst_device.Type() == OrtDevice::CPU) { + // copy from GPU to pinned or non-pinned CPU memory. + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, static_cast(stream.GetHandle()))); + } + } else { + if (src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { + // sync the stream first to make sure the data arrived + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast(stream.GetHandle()))); + } + + ORT_ENFORCE(dst_data != src_data); + memcpy(dst_data, src_data, bytes); + } + + return Status::OK(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_data_transfer.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_data_transfer.h new file mode 100644 index 0000000000000..272ea367ac7e4 --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_data_transfer.h @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "nv_includes.h" +#include "core/framework/data_transfer.h" + +namespace onnxruntime { + +class GPUDataTransfer : public IDataTransfer { + public: + GPUDataTransfer() = default; + ~GPUDataTransfer() = default; + + bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; + + // Dumpen MSVC warning about not fully overriding + using IDataTransfer::CopyTensor; + common::Status CopyTensor(const Tensor& src, Tensor& dst) const override; + common::Status CopyTensorAsync(const Tensor& src, Tensor& dst, Stream& stream) const override; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc index 25c130a849793..469d139ed03bb 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. 
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // Licensed under the MIT License. #include #include @@ -12,10 +13,11 @@ #include "nv_execution_provider.h" #include "nv_execution_provider_utils.h" #include "nv_execution_provider_custom_ops.h" +#include "nv_allocator.h" +#include "nv_data_transfer.h" #include "onnx_ctx_model_helper.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/providers/cuda/math/unary_elementwise_ops_impl.h" -#include "core/providers/cuda/gpu_data_transfer.h" #include "core/session/allocator_adapters.h" #include "cuda_runtime_api.h" #include @@ -113,16 +115,6 @@ void Impl_Cast( } } // namespace cuda -template <> -Status CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line) { - return g_host->CudaCall_false(retCode, exprString, libName, successCode, msg, file, line); -} - -template <> -void CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line) { - return g_host->CudaCall_true(retCode, exprString, libName, successCode, msg, file, line); -} - #if NV_TENSORRT_MAJOR >= 10 void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size, uint64_t /*alignment*/, cudaStream_t /*stream*/) noexcept { @@ -1311,13 +1303,14 @@ void NvExecutionProvider::IncrementRegularRunCountBeforeGraphCapture() { std::vector NvExecutionProvider::CreatePreferredAllocators() { AllocatorCreationInfo default_memory_info( - [](OrtDevice::DeviceId device_id) { return CreateCUDAAllocator(device_id, onnxruntime::CUDA); }, + [](OrtDevice::DeviceId device_id) { return std::make_unique(device_id, CUDA); }, narrow(device_id_)); AllocatorCreationInfo pinned_allocator_info( [](OrtDevice::DeviceId device_id) { ORT_UNUSED_PARAMETER(device_id); - return CreateCUDAPinnedAllocator(onnxruntime::CUDA_PINNED); + return std::make_unique(CUDA_PINNED); + ; }, 0); @@ -1325,7 +1318,7 @@ std::vector NvExecutionProvider::CreatePreferredAllocators() { } std::unique_ptr NvExecutionProvider::GetDataTransfer() const { - return onnxruntime::CreateGPUDataTransfer(); + return std::make_unique(); } Status NvExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index eee6a05f12729..afabc1fa9b1c9 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -347,7 +347,7 @@ common::Status IExecutionProvider::Compile(const std::vector& return g_host->IExecutionProvider__Compile(this, fused_nodes_and_graphs, node_compute_funcs); } -#if defined(USE_TENSORRT) || defined(USE_NV) +#if defined(USE_TENSORRT) std::unique_ptr CreateCUDAAllocator(int16_t device_id, const char* name) { return g_host->CreateCUDAAllocator(device_id, name); } diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index f4b7a78fe4c99..06e48a36942e2 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2198,7 +2198,7 @@ def main(): cmake_extra_defines = normalize_arg_list(args.cmake_extra_defines) - if args.use_tensorrt or args.use_nv_tensorrt_rtx: + if args.use_tensorrt: args.use_cuda = True if args.build_wheel or args.gen_doc or args.enable_training: From 
d684d90cb1fee0722600511dcf36f95b0dd126fb Mon Sep 17 00:00:00 2001 From: quic-tirupath Date: Fri, 9 May 2025 08:02:28 -0700 Subject: [PATCH 40/84] [QNN EP] Fix Weight Bias Quantization implementation (#24693) - ORT is populating all node attributes in GetAttributes() API call with keeping default values for unspecified attributes in NodeProto. - Check the possibility of default values and don't skip weight_bias_quantization on Conv operator if the weight has DeQuantize node with attribute block_size=0 is read. ### Description GetAttributes() API call is populating all attributes in node definition with assigning default values for unspecified attributes in the model. Weight Bias Quantization is being skipped when block_size attribute present in the DequantizeLinear node producing the weight for a Conv operator. Gracefully handle the default value of block_size i.e., 0 and apply the Weight Bias Quantization as the default value '0' has no significance. ### Motivation and Context Applying Weight Bias Quantization on Conv operator enables ORT QDQ transformer to fold the DQ-->Conv-->Q pattern into Conv operator. This improves inference time for some QDQ ONNX models. --- .../optimizer/qdq_transformer/weight_bias_quantization.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc index 83c5d7bc8d92a..58e90ea3c71c2 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc @@ -89,7 +89,9 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph } const auto& dq_attrs = dq_1->GetAttributes(); - if (dq_attrs.find("block_size") != dq_attrs.end()) { + auto attr_it = dq_attrs.find("block_size"); + // Default value of block_size=0 has no significance. Don't skip weight_bias_quantization. + if (attr_it != dq_attrs.end() && attr_it->second.i() != 0) { continue; } From 1920ba16c6c9749f9028785ee7918dd8fbdb1fa9 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 9 May 2025 08:40:29 -0700 Subject: [PATCH 41/84] Implement Public API GetTensorSizeInBytes (#24686) ### Description Implement Public API GetTensorSizeInBytes ### Motivation and Context Addresses https://github.com/microsoft/onnxruntime/issues/24680 --- .../core/session/onnxruntime_c_api.h | 15 +++++++++ .../core/session/onnxruntime_cxx_api.h | 12 +++++++ .../core/session/onnxruntime_cxx_inline.h | 7 ++++ onnxruntime/core/session/onnxruntime_c_api.cc | 28 ++++++++++++++++ onnxruntime/core/session/ort_apis.h | 3 ++ onnxruntime/test/shared_lib/test_inference.cc | 33 +++++++++++++++++++ 6 files changed, 98 insertions(+) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index a4a32fbea630a..25b6d72394e0c 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -5251,6 +5251,21 @@ struct OrtApi { * \since Version 1.22. */ const OrtEpApi*(ORT_API_CALL* GetEpApi)(); + + /** \brief Compute total size in bytes of the tensor data contained in an OrtValue. + * + * Returns the total number of bytes used to store the tensor data. For numeric tensors, + * this is sizeof(element_type) * total_element_count. OrtValues that are not tensors or + * that are tensors that contain strings will cause an error to be returned. 
+ * + * \param[in] ort_value OrtValue instance containing a tensor + * \param[out] size The total size of the tensor data in bytes + * + * \snippet{doc} snippets.dox OrtStatus Return Value + * + * \since Version 1.23 + */ + ORT_API2_STATUS(GetTensorSizeInBytes, _In_ const OrtValue* ort_value, _Out_ size_t* size); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 39c20e237b02c..8876c40fe9e6c 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -1738,6 +1738,18 @@ struct ConstValueImpl : Base { /// byte length for the specified string element size_t GetStringTensorElementLength(size_t element_index) const; + /// + /// Returns the total size of the tensor data in bytes. + /// + /// The total size of the tensor data in bytes + /// Throws an exception if the OrtValue does not contain a tensor or + /// if it contains a tensor that contains strings + /// + /// For numeric tensors, this is sizeof(element_type) * total_element_count. + /// + /// + size_t GetTensorSizeInBytes() const; ///< Wraps OrtApi::GetTensorSizeInBytes + #if !defined(DISABLE_SPARSE_TENSORS) /// /// The API returns the sparse data format this OrtValue holds in a sparse tensor. diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 94ad2118fa4d6..0d0b3198a8736 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1823,6 +1823,13 @@ inline size_t ConstValueImpl::GetStringTensorElementLength(size_t element_ind return out; } +template +inline size_t ConstValueImpl::GetTensorSizeInBytes() const { + size_t out; + ThrowOnError(GetApi().GetTensorSizeInBytes(this->p_, &out)); + return out; +} + template template inline const R* ConstValueImpl::GetTensorData() const { diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index d03b98a9c1eb5..868fab767fa7b 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -1206,6 +1206,33 @@ ORT_API_STATUS_IMPL(OrtApis::GetStringTensorElementLength, _In_ const OrtValue* API_IMPL_END } +ORT_API_STATUS_IMPL(OrtApis::GetTensorSizeInBytes, _In_ const OrtValue* value, _Out_ size_t* size) { + API_IMPL_BEGIN + + if (value == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Input `value` argument must not be null"); + } + + if (size == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Output `size` argument must not be null"); + } + + if (!value->IsAllocated() || !value->IsTensor()) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "OrtValue is expected to contain a tensor"); + } + + const auto& tensor = value->Get(); + + // Check if this is a string tensor + if (tensor.IsDataTypeString()) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "String tensors are not supported by this API"); + } + + *size = tensor.SizeInBytes(); + return nullptr; + API_IMPL_END +} + ORT_API_STATUS_IMPL(OrtApis::GetStringTensorContent, _In_ const OrtValue* value, _Out_writes_bytes_all_(s_len) void* s, size_t s_len, _Out_writes_all_(offsets_len) size_t* offsets, size_t offsets_len) { API_IMPL_BEGIN @@ -2996,6 +3023,7 @@ static constexpr OrtApi ort_api_1_to_23 = { &OrtApis::GetEpApi, // End of Version 22 - DO NOT MODIFY ABOVE (see above text for more 
information) + &OrtApis::GetTensorSizeInBytes, }; // OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase. diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index 47d1a543b5a31..81af6694f6273 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -598,4 +598,7 @@ ORT_API(const OrtKeyValuePairs*, EpDevice_EpOptions, _In_ const OrtEpDevice* ep_ ORT_API(const OrtHardwareDevice*, EpDevice_Device, _In_ const OrtEpDevice* ep_device); ORT_API(const OrtEpApi*, GetEpApi); + +ORT_API_STATUS_IMPL(GetTensorSizeInBytes, _In_ const OrtValue* ort_value, _Out_ size_t* size); + } // namespace OrtApis diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 0a4ea71933724..6460e3cb3aec4 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -425,6 +425,39 @@ TEST_P(CApiTestWithProvider, simple) { nullptr, nullptr); } +template +void TestGetTensorSizeInBytes(Ort::ConstMemoryInfo cpu_meminfo) { + constexpr const size_t expected_size_in_bytes = sizeof(T) * element_count_to_create; + constexpr const std::array dims = {1, static_cast(element_count_to_create)}; + std::array data; + std::fill(data.begin(), data.end(), T{1}); + + auto value = Ort::Value::CreateTensor(cpu_meminfo, data.data(), + data.size(), dims.data(), dims.size()); + + auto type_info = value.GetTypeInfo(); + ASSERT_EQ(type_info.GetONNXType(), ONNX_TYPE_TENSOR); + auto tensor_type_info = type_info.GetTensorTypeAndShapeInfo(); + const auto element_count = tensor_type_info.GetElementCount(); + ASSERT_EQ(expected_size_in_bytes / sizeof(T), element_count); + ASSERT_EQ(expected_size_in_bytes, value.GetTensorSizeInBytes()); +} + +TEST(CApiTest, TestGetTensorSizeInBytes) { + Ort::MemoryInfo cpu_meminfo("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); + + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); + TestGetTensorSizeInBytes(cpu_meminfo.GetConst()); +} + TEST(CApiTest, dim_param) { Ort::SessionOptions session_options; Ort::Session session(*ort_env, NAMED_AND_ANON_DIM_PARAM_URI, session_options); From 67f95573694b86cf34ac2174022714f05a8f8e68 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 9 May 2025 08:40:53 -0700 Subject: [PATCH 42/84] Fix improper iterator usage (#24698) ### Description This pull request includes a minor but important fix to the iteration logic in the `NvExecutionProvider::GetCapability` method to ensure correctness and avoid potential undefined behavior when removing elements from a container during iteration. ### Iteration logic fix: * In `onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc`, the loop for removing subgraphs smaller than the minimal size was updated to use `it = supported_nodes_vector.erase(it)` instead of `erase(it--)`. This ensures the iterator remains valid and avoids decrementing it unnecessarily, improving code safety and readability. ### Motivation and Context Prevent possible memory corruption. 
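To make the hazard concrete, a minimal sketch of the corrected loop follows; the container and element types are simplified stand-ins, not the actual SubGraphCollection_t definition.

```cpp
#include <utility>
#include <vector>

using SubGraph = std::pair<std::vector<size_t>, bool>;  // simplified stand-in

void RemoveSmallSubgraphs(std::vector<SubGraph>& subgraphs, size_t min_subgraph_size) {
  for (auto it = subgraphs.begin(); it != subgraphs.end();) {
    if (it->first.size() < min_subgraph_size) {
      // erase() returns an iterator to the element after the erased one,
      // so the loop can continue safely without a separate increment.
      it = subgraphs.erase(it);
    } else {
      ++it;
    }
  }
  // The previous pattern, subgraphs.erase(it--), decrements the iterator
  // before erasing; when the matching element is the first one, decrementing
  // begin() is undefined behavior, and the pattern also fights the fact that
  // erase() invalidates iterators at or after the erased position.
}
```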
Similar code was addressed in tensorrt EP. --- .../core/providers/nv_tensorrt_rtx/nv_execution_provider.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc index 469d139ed03bb..b2950cd2c5da3 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc @@ -2014,10 +2014,12 @@ NvExecutionProvider::GetCapability(const GraphViewer& graph, } // Remove subgraphs if its size is less than the predefined minimal size - for (auto it = supported_nodes_vector.begin(); it != supported_nodes_vector.end(); ++it) { + for (auto it = supported_nodes_vector.begin(); it != supported_nodes_vector.end();) { const size_t subgraph_size = it->first.size(); if (subgraph_size < min_subgraph_size_) { - supported_nodes_vector.erase(it--); + it = supported_nodes_vector.erase(it); + } else { + ++it; } } From 0d9f4e70df88da00c84522a450e85b4cc126130f Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 9 May 2025 09:09:38 -0700 Subject: [PATCH 43/84] [web CI] fixes shader key validation failure (#24701) ### Description loose the regex for the log prefix. this fixes the shader key validation failure and makes it less sensitive to future log format change. ### Motivation and Context with chrome update, the chrome_debug logging format changed a little bit. --- .../webgpu-validate-shader-key/parse-chromium-debug-log.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/webgpu-validate-shader-key/parse-chromium-debug-log.js b/.github/actions/webgpu-validate-shader-key/parse-chromium-debug-log.js index 2342381f03934..d34ff95192089 100644 --- a/.github/actions/webgpu-validate-shader-key/parse-chromium-debug-log.js +++ b/.github/actions/webgpu-validate-shader-key/parse-chromium-debug-log.js @@ -16,7 +16,7 @@ async function processChromiumDebugLog() { for await (const line of rl) { const result = - /^\[.+INFO:CONSOLE\(\d+\)]\ "(?.+)",\ source:\ [^"]+?\(\d+\)$/.exec( + /^\[.+INFO:CONSOLE.+?]\ "(?.+)",\ source:\ [^"]+?\(\d+\)$/.exec( line ); if (!result) { From 8a97463aeac9f2be8a3bffdcac6b8a18c7b19bb1 Mon Sep 17 00:00:00 2001 From: Qiujiao Date: Sat, 10 May 2025 00:10:01 +0800 Subject: [PATCH 44/84] Add trace event control for ORT Web performance profiling (#23393) ### Description Add trace event control to better profile ORT web performance ### Motivation and Context ORT Web's current tracing implementation lacks interfaces for performance profiling using about://tracing. This PR introduces these interfaces, enabling performance bottleneck identification in ORT Web and adding several trace events for WebNN. 
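For context on how the C++ (WebNN) side of this gating looks, here is a condensed, hypothetical sketch of the pattern the model.cc and data_transfer.cc changes follow; the RAII wrapper and the label string are illustrative and not part of the change itself.

```cpp
#include <emscripten/val.h>

// Emits console.time()/console.timeEnd() pairs around a region so it shows
// up in DevTools / about://tracing, but only when tracing was enabled from
// JavaScript via Module["webnnEnableTraceEvent"].
class ScopedTraceEvent {
 public:
  explicit ScopedTraceEvent(const char* label)
      : enabled_(emscripten::val::module_property("webnnEnableTraceEvent").as<bool>()),
        label_(label) {
    if (enabled_) {
      emscripten::val::global("console").call<void>("time", emscripten::val(label_));
    }
  }
  ~ScopedTraceEvent() {
    if (enabled_) {
      emscripten::val::global("console").call<void>("timeEnd", emscripten::val(label_));
    }
  }

 private:
  bool enabled_;
  const char* label_;
};

// Hypothetical call site:
//   ScopedTraceEvent trace("ORT::Dispatch::jsepEnsureTensor");
//   ... work being measured ...
```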
--- js/common/lib/inference-session-impl.ts | 6 ++++- js/common/lib/trace.ts | 22 +++++++++++++++++++ js/web/lib/wasm/jsep/init.ts | 2 ++ js/web/lib/wasm/wasm-core-impl.ts | 8 ++++++- js/web/lib/wasm/wasm-types.ts | 1 + .../core/providers/webnn/builders/model.cc | 8 +++++++ .../core/providers/webnn/data_transfer.cc | 11 ++++++++++ onnxruntime/wasm/pre-jsep.js | 1 + 8 files changed, 57 insertions(+), 2 deletions(-) diff --git a/js/common/lib/inference-session-impl.ts b/js/common/lib/inference-session-impl.ts index 797dba8b94089..877c595bffd15 100644 --- a/js/common/lib/inference-session-impl.ts +++ b/js/common/lib/inference-session-impl.ts @@ -6,7 +6,7 @@ import { InferenceSessionHandler } from './backend.js'; import { InferenceSession as InferenceSessionInterface } from './inference-session.js'; import { OnnxValue } from './onnx-value.js'; import { Tensor } from './tensor.js'; -import { TRACE_FUNC_BEGIN, TRACE_FUNC_END } from './trace.js'; +import { TRACE_FUNC_BEGIN, TRACE_FUNC_END, TRACE_EVENT_BEGIN, TRACE_EVENT_END } from './trace.js'; type SessionOptions = InferenceSessionInterface.SessionOptions; type RunOptions = InferenceSessionInterface.RunOptions; @@ -22,6 +22,7 @@ export class InferenceSession implements InferenceSessionInterface { run(feeds: FeedsType, fetches: FetchesType, options?: RunOptions): Promise; async run(feeds: FeedsType, arg1?: FetchesType | RunOptions, arg2?: RunOptions): Promise { TRACE_FUNC_BEGIN(); + TRACE_EVENT_BEGIN('InferenceSession.run'); const fetches: { [name: string]: OnnxValue | null } = {}; let options: RunOptions = {}; // check inputs @@ -120,6 +121,7 @@ export class InferenceSession implements InferenceSessionInterface { } } } + TRACE_EVENT_END('InferenceSession.run'); TRACE_FUNC_END(); return returnValue; } @@ -144,6 +146,7 @@ export class InferenceSession implements InferenceSessionInterface { arg3?: SessionOptions, ): Promise { TRACE_FUNC_BEGIN(); + TRACE_EVENT_BEGIN('InferenceSession.create'); // either load from a file or buffer let filePathOrUint8Array: string | Uint8Array; let options: SessionOptions = {}; @@ -207,6 +210,7 @@ export class InferenceSession implements InferenceSessionInterface { // resolve backend, update session options with validated EPs, and create session handler const [backend, optionsWithValidatedEPs] = await resolveBackendAndExecutionProviders(options); const handler = await backend.createInferenceSessionHandler(filePathOrUint8Array, optionsWithValidatedEPs); + TRACE_EVENT_END('InferenceSession.create'); TRACE_FUNC_END(); return new InferenceSession(handler); } diff --git a/js/common/lib/trace.ts b/js/common/lib/trace.ts index 25d178f15a29d..0f20dd39935ac 100644 --- a/js/common/lib/trace.ts +++ b/js/common/lib/trace.ts @@ -51,3 +51,25 @@ export const TRACE_FUNC_END = (extraMsg?: string) => { } TRACE_FUNC('END', extraMsg); }; + +/** + * @ignore + */ +export const TRACE_EVENT_BEGIN = (extraMsg?: string) => { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { + return; + } + // eslint-disable-next-line no-console + console.time(`ORT::${extraMsg}`); +}; + +/** + * @ignore + */ +export const TRACE_EVENT_END = (extraMsg?: string) => { + if (typeof env.trace === 'undefined' ? 
!env.wasm.trace : !env.trace) { + return; + } + // eslint-disable-next-line no-console + console.timeEnd(`ORT::${extraMsg}`); +}; diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 8ab6b054bf8a7..463e26d0208e5 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -291,6 +291,8 @@ export const init = async ( }, // jsepDownloadTensor async (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => backend.downloadTensor(tensorId, dstBuffer), + // jsepEnableTraceEvent + !!env.trace, ]); } }; diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index f42a224ed2e85..cfdc0053b3485 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -6,7 +6,7 @@ // https://github.com/webmachinelearning/webnn/issues/677 /// -import { Env, InferenceSession, Tensor } from 'onnxruntime-common'; +import { Env, InferenceSession, Tensor, TRACE_EVENT_BEGIN, TRACE_EVENT_END } from 'onnxruntime-common'; import { SerializableInternalBuffer, @@ -711,6 +711,7 @@ export const run = async ( try { [runOptionsHandle, runOptionsAllocs] = setRunOptions(options); + TRACE_EVENT_BEGIN('wasm prepareInputOutputTensor'); // create input tensors for (let i = 0; i < inputCount; i++) { await prepareInputOutputTensor( @@ -736,6 +737,7 @@ export const run = async ( enableGraphCapture, ); } + TRACE_EVENT_END('wasm prepareInputOutputTensor'); for (let i = 0; i < inputCount; i++) { wasm.setValue(inputValuesOffset + i * ptrSize, inputTensorHandles[i], '*'); @@ -755,6 +757,7 @@ export const run = async ( ); } + TRACE_EVENT_BEGIN('wasm bindInputsOutputs'); // process inputs for (let i = 0; i < inputCount; i++) { const index = inputIndices[i]; @@ -788,6 +791,7 @@ export const run = async ( } } } + TRACE_EVENT_END('wasm bindInputsOutputs'); activeSessions.set(sessionId, [ sessionHandle, inputNamesUTF8Encoded, @@ -830,6 +834,7 @@ export const run = async ( const output: TensorMetadata[] = []; const outputPromises: Array> = []; + TRACE_EVENT_BEGIN('wasm ProcessOutputTensor'); for (let i = 0; i < outputCount; i++) { const tensor = Number(wasm.getValue(outputValuesOffset + i * ptrSize, '*')); if (tensor === outputTensorHandles[i]) { @@ -1028,6 +1033,7 @@ export const run = async ( for (const [index, data] of await Promise.all(outputPromises)) { output[index][2] = data; } + TRACE_EVENT_END('wasm ProcessOutputTensor'); return output; } finally { wasm.webnnOnRunEnd?.(sessionHandle); diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index f2d051927b1d5..29a4028ae46cc 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -71,6 +71,7 @@ export declare namespace JSEP { ensureTensor: EnsureTensorFunction, uploadTensor: UploadTensorFunction, downloadTensor: DownloadTensorFunction, + enableTraceEvent: boolean, ], ): void; } diff --git a/onnxruntime/core/providers/webnn/builders/model.cc b/onnxruntime/core/providers/webnn/builders/model.cc index 40fdfc609e6a1..ef829e82823d0 100644 --- a/onnxruntime/core/providers/webnn/builders/model.cc +++ b/onnxruntime/core/providers/webnn/builders/model.cc @@ -158,6 +158,11 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap& outputs) { auto webnnEnsureTensor = emscripten::val::module_property("webnnEnsureTensor"); auto promises = emscripten::val::array(); + bool trace = emscripten::val::module_property("webnnEnableTraceEvent").as(); + emscripten::val console = emscripten::val::global("console"); + if (trace) { + console.call("time", 
emscripten::val("ORT::Dispatch::jsepEnsureTensor")); + } for (const auto& [_, tensor] : inputs) { emscripten::val shape = emscripten::val::array(); for (const auto& dim : tensor.tensor_info.shape) { @@ -176,6 +181,9 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap(tensor.buffer), tensor.tensor_info.data_type, shape, false); promises.call("push", ml_tensor); } + if (trace) { + console.call("timeEnd", emscripten::val("ORT::Dispatch::jsepEnsureTensor")); + } auto ml_tensors = emscripten::val::global("Promise").call("all", promises).await(); for (const auto& [name, _] : inputs) { wnn_inputs_.set(name, ml_tensors.call("shift")); diff --git a/onnxruntime/core/providers/webnn/data_transfer.cc b/onnxruntime/core/providers/webnn/data_transfer.cc index aa85277b72453..17369e6fbc75d 100644 --- a/onnxruntime/core/providers/webnn/data_transfer.cc +++ b/onnxruntime/core/providers/webnn/data_transfer.cc @@ -20,6 +20,11 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { // We don't need to transfer the tensor to an MLTensor, so we don't need to copy the data. return Status::OK(); } + bool trace = emscripten::val::module_property("webnnEnableTraceEvent").as(); + emscripten::val console = emscripten::val::global("console"); + if (trace) { + console.call("time", emscripten::val("ORT::DataTransfer::CopyTensor")); + } size_t bytes = src.SizeInBytes(); if (bytes > 0) { @@ -30,10 +35,16 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { if (dst_device.Type() == OrtDevice::GPU) { EM_ASM({ Module.webnnUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes); + if (trace) { + console.call("timeEnd", emscripten::val("ORT::DataTransfer::webnnUploadTensor")); + } } else { auto webnnDownloadTensor = emscripten::val::module_property("webnnDownloadTensor"); auto subarray = emscripten::typed_memory_view(bytes, static_cast(dst_data)); webnnDownloadTensor(reinterpret_cast(src_data), subarray).await(); + if (trace) { + console.call("timeEnd", emscripten::val("ORT::DataTransfer::webnnDownloadTensor")); + } } } diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index df09920bddebd..8232a286d4480 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -104,6 +104,7 @@ Module["jsepInit"] = (name, params) => { Module["webnnEnsureTensor"], Module.webnnUploadTensor, Module["webnnDownloadTensor"], + Module["webnnEnableTraceEvent"], ] = params.slice(1); // This function is called from both JS and an EM_ASM block, it needs both a minifiable name and an explicit name. From 17cffcb936febb5d9b166574b1803e77cc83182d Mon Sep 17 00:00:00 2001 From: vraspar Date: Fri, 9 May 2025 16:47:01 -0700 Subject: [PATCH 45/84] Update AnyChart library to version 8.13.0 in perf_view HTML (#24684) ### Description The perf view uses outdated any chart version, which results in perf_view not showing any summary. Previously happened: https://github.com/microsoft/onnxruntime/issues/16873 --- tools/perf_view/ort_perf_view.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf_view/ort_perf_view.html b/tools/perf_view/ort_perf_view.html index 509fe5593f6a1..9cd65f06d9337 100644 --- a/tools/perf_view/ort_perf_view.html +++ b/tools/perf_view/ort_perf_view.html @@ -3,9 +3,9 @@ Onnxruntime Perf View - + - +