diff --git a/.github/workflows/linux_openvino_ci.yml b/.github/workflows/linux_openvino_ci.yml
new file mode 100644
index 0000000000000..12495b1f26c65
--- /dev/null
+++ b/.github/workflows/linux_openvino_ci.yml
@@ -0,0 +1,45 @@
+name: Linux OpenVINO CI
+
+on:
+ push:
+ branches: [ main, 'rel-*' ]
+ pull_request:
+ branches: [ main, 'rel-*' ]
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+ packages: write # Needed if the reusable workflow pushes images
+ attestations: write # Optional: for artifact attestations if enabled
+ id-token: write # Optional: may be needed for OIDC authentication (e.g., ACR)
+
+jobs:
+ build_test_openvino:
+ name: Build and Test OpenVINO EP (AlmaLinux8, Py3.12)
+ # Use the same reusable workflow as the other Linux CI pipelines
+ uses: ./.github/workflows/reusable_linux_build.yml
+ with:
+ pool_name: "onnxruntime-github-Ubuntu2204-AMD-CPU"
+ build_config: Release
+ # Architecture: OpenVINO only supports Intel x64
+ architecture: x64
+ dockerfile_path: tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile
+ docker_image_repo: onnxruntimeopenvino
+
+ execution_providers: 'openvino'
+
+ extra_build_flags: '--use_openvino CPU --enable_generic_interface --build_shared_lib'
+
+ # Python Path Prefix: Set the correct Python 3.12 path inside the manylinux container
+ python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH'
+
+ run_tests: true
+ upload_build_output: false
+
+ # Secrets: Pass the necessary GitHub token
+ secrets:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml
index adc2346d1bf1b..d03c9a407d54f 100644
--- a/.github/workflows/publish-python-apidocs.yml
+++ b/.github/workflows/publish-python-apidocs.yml
@@ -40,6 +40,7 @@ jobs:
- name: Generate Python docs with Sphinx
run: |
cd tools/doc
+ chmod +x *
./builddoc.sh /usr/bin ../.. ../../build
- name: Log source commit
run: git rev-parse --short HEAD > build/docs/html/source-version.txt
diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json
index d883f89dfdc56..f29857a231eb9 100644
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@@ -18,7 +18,7 @@
"maven": {
"GroupId": "com.google.protobuf",
"ArtifactId": "protobuf-java",
- "Version": "3.21.7"
+ "Version": "3.25.5"
},
"DevelopmentDependency": true
}
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index a841b17a2a571..5ab1605dd3a99 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -62,7 +62,7 @@ option(onnxruntime_GENERATE_TEST_REPORTS "Enable test report generation" OFF)
option(onnxruntime_ENABLE_STATIC_ANALYSIS "Enable static analysis" OFF)
option(onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES "Use a custom SDL Rule. It is mainly for our CI build" OFF)
option(onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE "Use a custom SDL Rule. It is mainly for our CI build" OFF)
-option(onnxruntime_ENABLE_PYTHON "Enable python buildings" OFF)
+option(onnxruntime_ENABLE_PYTHON "Enable python bindings" OFF)
# Enable it may cause LNK1169 error
option(onnxruntime_ENABLE_MEMLEAK_CHECKER "Experimental: Enable memory leak checker in Windows debug build" OFF)
option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
diff --git a/docs/python/ReadMeOV.rst b/docs/python/ReadMeOV.rst
index 845f79cf8257c..fefef421158f8 100644
--- a/docs/python/ReadMeOV.rst
+++ b/docs/python/ReadMeOV.rst
@@ -16,7 +16,7 @@ Requirements
^^^^^^^^^^^^
- Ubuntu 18.04, 20.04 or Windows 10 - 64 bit
-- Python 3.11, 3.12 and 3.13 for Windows and Linux
+- Python 3.10, 3.11, 3.12 and 3.13 for Windows and Linux
This package supports:
- Intel® CPUs
@@ -29,7 +29,7 @@ This package supports:
Please install OpenVINO™ PyPi Package separately for Windows.
For installation instructions on Windows please refer to `OpenVINO™ Execution Provider for ONNX Runtime for Windows `_.
-**OpenVINO™ Execution Provider for ONNX Runtime** Linux Wheels comes with pre-built libraries of OpenVINO™ version 2025.0.0 eliminating the need to install OpenVINO™ separately.
+**OpenVINO™ Execution Provider for ONNX Runtime** Linux Wheels come with pre-built libraries of OpenVINO™ version 2025.1.0, eliminating the need to install OpenVINO™ separately.
For more details on build and installation please refer to `Build `_.
diff --git a/include/onnxruntime/core/providers/providers.h b/include/onnxruntime/core/providers/providers.h
index 2cfd5acf66293..8097be287df82 100644
--- a/include/onnxruntime/core/providers/providers.h
+++ b/include/onnxruntime/core/providers/providers.h
@@ -2,6 +2,10 @@
// Licensed under the MIT License.
#pragma once
+#include <memory>
+
+struct OrtSessionOptions;
+struct OrtLogger;
namespace onnxruntime {
class IExecutionProvider;
@@ -9,5 +13,20 @@ class IExecutionProvider;
struct IExecutionProviderFactory {
virtual ~IExecutionProviderFactory() = default;
virtual std::unique_ptr<IExecutionProvider> CreateProvider() = 0;
+
+ /// <summary>
+ /// Creates an IExecutionProvider instance. Enables initialization of an EP instance using session-level options
+ /// such as session configs (string key/value pairs), graph optimization level, etc.
+ ///
+ /// The default implementation ignores the arguments and calls the above CreateProvider() function,
+ /// which does not take in any arguments.
+ ///
+ /// This version of CreateProvider() is used by InferenceSession when registering EPs.
+ /// </summary>
+ /// <param name="session_options">Options for the session in which the IExecutionProvider is used</param>
+ /// <param name="session_logger">Session logger that should be used by the IExecutionProvider.</param>
+ /// <returns>An IExecutionProvider</returns>
+ virtual std::unique_ptr<IExecutionProvider> CreateProvider(const OrtSessionOptions& session_options,
+                                                            const OrtLogger& session_logger);
};
} // namespace onnxruntime
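
// A minimal sketch, not part of this diff, of how a factory might override the new
// session-aware CreateProvider overload while keeping the legacy one. MyEpFactory is
// a hypothetical name, and the include path assumes ORT's internal source layout.
#include <memory>
#include "core/providers/providers.h"  // assumed include path

namespace onnxruntime {

struct MyEpFactory : IExecutionProviderFactory {
  // Legacy overload: no session context is available.
  std::unique_ptr<IExecutionProvider> CreateProvider() override {
    return nullptr;  // construct the EP with default settings here
  }

  // New overload: session-level options and the session logger are available,
  // so the EP can be initialized from string key/value config entries.
  std::unique_ptr<IExecutionProvider> CreateProvider(const OrtSessionOptions& session_options,
                                                     const OrtLogger& session_logger) override {
    (void)session_options;  // e.g., read session config entries here
    (void)session_logger;   // e.g., adopt the session logger for EP logging
    return CreateProvider();  // fall back to the default-constructed EP in this sketch
  }
};

}  // namespace onnxruntime
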
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 3bf0d5e19c525..d557ee7443306 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -310,6 +310,7 @@ ORT_RUNTIME_CLASS(ValueInfo);
ORT_RUNTIME_CLASS(Node);
ORT_RUNTIME_CLASS(Graph);
ORT_RUNTIME_CLASS(Model);
+ORT_RUNTIME_CLASS(ModelCompilationOptions);
#ifdef _MSC_VER
typedef _Return_type_success_(return == 0) OrtStatus* OrtStatusPtr;
@@ -673,6 +674,9 @@ typedef struct OrtTrainingApi OrtTrainingApi;
struct OrtModelEditorApi;
typedef struct OrtModelEditorApi OrtModelEditorApi;
+struct OrtCompileApi;
+typedef struct OrtCompileApi OrtCompileApi;
+
/** \brief The helper interface to get the right version of OrtApi
*
* Get a pointer to this structure through ::OrtGetApiBase
@@ -3638,10 +3642,16 @@ struct OrtApi {
* \param[in] provider_options_values - values to configure the provider options
* \param[in] num_keys - number of keys passed in
*
- * Currently supported providers:
- * QNN
- * SNPE
- * XNNPACK
+ * Currently supported provider names:
+ * QNNExecutionProvider (or QNN)
+ * OpenVINOExecutionProvider (or OpenVINO)
+ * XnnpackExecutionProvider (or XNNPACK)
+ * WebNNExecutionProvider (or WEBNN)
+ * WebGpuExecutionProvider (or WebGPU)
+ * AzureExecutionProvider (or AZURE)
+ * JsExecutionProvider (or JS)
+ * VitisAIExecutionProvider (or VitisAI)
+ * CoreMLExecutionProvider (or CoreML)
*
* Note: If an execution provider has a dedicated SessionOptionsAppendExecutionProvider_ function
* that should be used to add it.
@@ -3651,93 +3661,78 @@ struct OrtApi {
* name. E.g., given backend type "htp", on Windows, the backend path would be "QnnHtp.dll", and on other
* platforms, it would be "libQnnHtp.so". Mutually exclusive with "backend_path".
* Available options:
- * - "cpu"
- * - "gpu"
- * - "htp": Default.
- * - "saver"
+ * -# "cpu"
+ * -# "gpu"
+ * -# "htp": Default.
+ * -# "saver"
* "backend_path": File path to QNN backend library. Mutually exclusive with "backend_type".
* "profiling_level": QNN profiling level.
* Available options:
- * - "off": Default.
- * - "basic"
- * - "detailed"
+ * -# "off": Default.
+ * -# "basic"
+ * -# "detailed"
* "profiling_file_path": QNN profiling file path if ETW not enabled.
* "rpc_control_latency": QNN RPC control latency.
* "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
* "htp_performance_mode": QNN performance mode.
* Available options:
- * - "burst"
- * - "balanced"
- * - "default": Default.
- * - "high_performance"
- * - "high_power_saver"
- * - "low_balanced"
- * - "extreme_power_saver"
- * - "low_power_saver"
- * - "power_saver"
- * - "sustained_high_performance"
+ * -# "burst"
+ * -# "balanced"
+ * -# "default": Default.
+ * -# "high_performance"
+ * -# "high_power_saver"
+ * -# "low_balanced"
+ * -# "extreme_power_saver"
+ * -# "low_power_saver"
+ * -# "power_saver"
+ * -# "sustained_high_performance"
* "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
* dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
* may alter model/EP partitioning. Use only for debugging.
* "qnn_context_priority": QNN context priority.
* Available options:
- * - "low"
- * - "normal": Default.
- * - "normal_high"
- * - "high"
+ * -# "low"
+ * -# "normal": Default.
+ * -# "normal_high"
+ * -# "high"
* "htp_graph_finalization_optimization_mode": Set the optimization mode for graph finalization on the HTP backend.
* Available options:
- * - "0": Default.
- * - "1": Faster preparation time, less optimal graph.
- * - "2": Longer preparation time, more optimal graph.
- * - "3": Longest preparation time, most likely even more optimal graph. See QNN SDK documentation for specific
+ * -# "0": Default.
+ * -# "1": Faster preparation time, less optimal graph.
+ * -# "2": Longer preparation time, more optimal graph.
+ * -# "3": Longest preparation time, most likely even more optimal graph. See QNN SDK documentation for specific
* details.
* "soc_model": The SoC model number. Refer to the QNN SDK documentation for valid values.
* Defaults to "0" (unknown).
* "htp_arch": The minimum HTP architecture the driver will use to select compatible QNN operators.
* Available options:
- * - "0": Default (none).
- * - "68"
- * - "69"
- * - "73"
- * - "75"
+ * -# "0": Default (none).
+ * -# "68"
+ * -# "69"
+ * -# "73"
+ * -# "75"
* "device_id": The ID of the device to use when setting 'htp_arch'. Defaults to "0" (for single device).
* "enable_htp_fp16_precision": Used for float32 model for HTP backend.
* Enable the float32 model to be inferenced with fp16 precision. Otherwise, it will be fp32 precision.
- * - "0": With fp32 precision.
- * - "1": Default. With fp16 precision.
+ * -# "0": With fp32 precision.
+ * -# "1": Default. With fp16 precision.
* "offload_graph_io_quantization": Offload graph input quantization and graph output dequantization to another
* execution provider (typically CPU EP).
- * - "0": Disabled. QNN EP will handle quantization and dequantization of graph I/O.
- * - "1": Enabled. This is the default value.
+ * -# "0": Disabled. QNN EP will handle quantization and dequantization of graph I/O.
+ * -# "1": Enabled. This is the default value.
* "enable_htp_spill_fill_buffer": Enable HTP spill fill buffer setting. The flag is used while generating context
* binary.
- * - "0": Default. Disabled.
- * - "1": Enabled.
+ * -# "0": Default. Disabled.
+ * -# "1": Enabled.
* "enable_htp_shared_memory_allocator": Enable the QNN HTP shared memory allocator. Requires libcdsprpc.so/dll to
* be available.
- * - "0": Default. Disabled.
- * - "1": Enabled.
+ * -# "0": Default. Disabled.
+ * -# "1": Enabled.
* "dump_json_qnn_graph": Set to "1" to dump QNN graphs generated by QNN EP as JSON files. Each graph partition
* assigned to QNN EP is dumped to a separate file.
* "json_qnn_graph_dir": Directory in which to dump QNN JSON graphs. If not specified, QNN graphs are dumped in the
* program's current working directory. Ignored if "dump_json_qnn_graph" is not set.
*
- * SNPE supported keys:
- * "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",
- * "DSP", "DSP_FIXED8_TF", "AIP_FIXED_TF", "AIP_FIXED8_TF".
- * Mapping to SNPE Runtime_t definition:
- * CPU, CPU_FLOAT32 => zdl::DlSystem::Runtime_t::CPU;
- * GPU, GPU_FLOAT32_16_HYBRID => zdl::DlSystem::Runtime_t::GPU;
- * GPU_FLOAT16 => zdl::DlSystem::Runtime_t::GPU_FLOAT16;
- * DSP, DSP_FIXED8_TF => zdl::DlSystem::Runtime_t::DSP.
- * AIP_FIXED_TF, AIP_FIXED8_TF => zdl::DlSystem::Runtime_t::AIP_FIXED_TF.
- * "priority": execution priority, options: "low", "normal".
- * "buffer_type": ITensor or user buffers, options: "ITENSOR", user buffer with different types - "TF8", "TF16", "UINT8", "FLOAT".
- * "ITENSOR" -- default, ITensor which is float only.
- * "TF8" -- quantized model required, "FLOAT" -- for both quantized or non-quantized model
- * "enable_init_cache": enable SNPE init caching feature, set to 1 to enabled it. Disabled by default.
- *
* XNNPACK supported keys:
* "intra_op_num_threads": number of thread-pool size to use for XNNPACK execution provider.
* default value is 0, which means to use the session thread-pool size.
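
// A hedged sketch of passing the provider options documented above through
// SessionOptionsAppendExecutionProvider (via its C++ wrapper). The option keys
// come from this header; the values chosen here are illustrative.
#include <string>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"  // assumed include path

void AppendQnn(Ort::SessionOptions& session_options) {
  std::unordered_map<std::string, std::string> provider_options{
      {"backend_type", "htp"},            // default backend
      {"htp_performance_mode", "burst"},
      {"profiling_level", "off"},
  };
  session_options.AppendExecutionProvider("QNN", provider_options);
}
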
@@ -4855,6 +4850,7 @@ struct OrtApi {
/** \brief Get the value name from an OrtValueInfo instance.
* \param[in] value_info The OrtValueInfo instance.
+ * \param[out] name The name of the OrtValueInfo
* \snippet{doc} snippets.dox OrtStatus Return Value
* \since Version 1.21.
*/
@@ -4862,6 +4858,7 @@ struct OrtApi {
/** \brief Get the type information from an OrtValueInfo instance.
* \param[in] value_info The OrtValueInfo instance.
+ * \param[out] type_info The type info of the OrtValueInfo
* \snippet{doc} snippets.dox OrtStatus Return Value
* \since Version 1.21.
*/
@@ -4889,6 +4886,7 @@ struct OrtApi {
* \param[in] shape Dimensions of the Tensor. All values should be > 0.
* \param[in] shape_len Number of dimensions in the shape array.
* \param[in] type Data type of the Tensor.
+ * \param[out] out Newly created ::OrtValue. Must be freed with OrtApi::ReleaseValue
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -4912,11 +4910,27 @@ struct OrtApi {
* is not guaranteed. The session may have already been created and initialized
* before the cancellation request was issued.
*
- * \snippet{doc} snippets.dox OrtStatus
+ * \snippet{doc} snippets.dox OrtStatus Return Value
*
+ * \since Version 1.21.
*/
ORT_API2_STATUS(SessionOptionsSetLoadCancellationFlag, _Inout_ OrtSessionOptions* options,
_In_ bool cancel);
+
+ /** \brief Get the Compile API instance.
+ *
+ * Get the Compile API instance to compile ONNX models. Execution providers that support compilation fuse a subgraph
+ * into an EPContext node that wraps a provider-specific binary representation of the subgraph.
+ * For more details about the EPContext design, refer to:
+ * \htmlonly
+ * EPContext design document.
+ * \endhtmlonly
+ *
+ * \return Compile API struct instance.
+ *
+ * \since Version 1.22.
+ */
+ const OrtCompileApi*(ORT_API_CALL* GetCompileApi)();
};
/*
@@ -5056,7 +5070,7 @@ struct OrtModelEditorApi {
* User can release `tensor_info` after creating the OrtTypeInfo.
*
* \param[in] tensor_info Tensor type and shape information.
- * \param[out] TypeInfo instance for the tensor.
+ * \param[out] type_info TypeInfo instance for the tensor.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -5072,7 +5086,7 @@ struct OrtModelEditorApi {
* User can release `tensor_info` after creating the OrtTypeInfo.
*
* \param[in] tensor_info SparseTensor type and shape information.
- * \param[out] TypeInfo instance for the tensor.
+ * \param[out] type_info TypeInfo instance for the tensor.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -5089,7 +5103,7 @@ struct OrtModelEditorApi {
*
* \param[in] map_key_type Key type for the map.
* \param[in] map_value_type Value type for the map.
- * \param[out] TypeInfo instance for the map.
+ * \param[out] type_info TypeInfo instance for the map.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -5105,7 +5119,7 @@ struct OrtModelEditorApi {
* User can release `sequence_type` after creating the OrtTypeInfo.
*
* \param[in] sequence_type Sequence type and shape information.
- * \param[out] TypeInfo instance for the sequence.
+ * \param[out] type_info TypeInfo instance for the sequence.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -5119,8 +5133,8 @@ struct OrtModelEditorApi {
*
* User can release `contained_type` after creating the OrtTypeInfo.
*
- * \param[in] tensor_info Tensor type and shape information.
- * \param[out] TypeInfo instance for the tensor.
+ * \param[in] contained_type Tensor type and shape information.
+ * \param[out] type_info TypeInfo instance for the tensor.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -5132,6 +5146,7 @@ struct OrtModelEditorApi {
*
* \param[in] name The name of the input or output.
* \param[in] type_info The type information for the input or output. The provided value is copied.
+ * \param[out] value_info The OrtValueInfo instance.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -5271,6 +5286,7 @@ struct OrtModelEditorApi {
* If augmenting an existing model add additional opset versions if needed.
* \param[in] opset_entries_len The number of domain_names and opset_versions entries.
* Domain and opset entries should be 1:1
+ * \param[out] model The OrtModel instance.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -5362,6 +5378,7 @@ struct OrtModelEditorApi {
* \param{in} model_data The model data for the existing model to augment.
* \param{in} model_data_length The length of the model data.
* \param{in} options The OrtSessionOptions instance.
+ * \param{out} out The created OrtSession instance.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -5376,12 +5393,13 @@ struct OrtModelEditorApi {
*
* When using the Model Editor API to augment a model, any new nodes must conform to the opset version of the
* original model. To do that the user must be able to discover that opset version.
+ * Returns an error if the domain is not used in the model.
*
* \param[in] session OrtSession to query
* \param[in] domain Domain to query. The ONNX domain is an empty string.
* \param[out] opset The opset version of the domain.
*
- * \snippet{doc} snippets.dox OrtStatus Return Value. Returns an error if the domain is not used in the model.
+ * \snippet{doc} snippets.dox OrtStatus Return Value
*
* \since Version 1.21.
*/
@@ -5414,7 +5432,7 @@ struct OrtModelEditorApi {
*
* \param[in] session OrtSession to finalize. Session must have been created using CreateModelEditorSession[FromArray].
* \param[in] options OrtSessionOptions to use for the session.
- * \param[in] Optional prepacked_weights_container OrtPrepackedWeightsContainer to use for the session.
+ * \param[in] prepacked_weights_container Optional OrtPrepackedWeightsContainer to use for the session.
Set to nullptr if not used.
* \snippet{doc} snippets.dox OrtStatus Return Value
*
@@ -5425,6 +5443,193 @@ struct OrtModelEditorApi {
#endif // !defined(ORT_MINIMAL_BUILD)
};
+/**
+ * ORT Compile API
+ */
+
+/**
+ * \brief The OrtCompileApi struct provides functions to compile ONNX models.
+ *
+ * Execution providers that support compilation fuse a subgraph into an EPContext node that wraps a provider-specific
+ * binary representation of the subgraph.
+ * For more details about the EPContext design, refer to:
+ * \htmlonly
+ * EPContext design document.
+ * \endhtmlonly
+ *
+ * Example (error handling not shown):
+ * OrtStatus* status = NULL;
+ * const OrtCompileApi* compile_api = ort_api->GetCompileApi();
+ * OrtModelCompilationOptions* compile_options = NULL;
+ *
+ * status = compile_api->CreateModelCompilationOptionsFromSessionOptions(env, session_options, &compile_options);
+ * status = compile_api->ModelCompilationOptions_SetInputModelPath(compile_options, ORT_TSTR("model.onnx"));
+ * status = compile_api->ModelCompilationOptions_SetOutputModelPath(compile_options, ORT_TSTR("model.compiled.onnx"));
+ * status = compile_api->CompileModel(env, compile_options);
+ * compile_api->ReleaseModelCompilationOptions(compile_options);
+ *
+ * \since Version 1.22.
+ */
+struct OrtCompileApi {
+ /// @}
+ /// \name OrtModelCompilationOptions
+ /// @{
+ ORT_CLASS_RELEASE(ModelCompilationOptions);
+
+ /** \brief Creates an OrtModelCompilationOptions object from an existing OrtSessionOptions object.
+ *
+ * An OrtModelCompilationOptions object contains the settings used to generate a compiled ONNX model.
+ * The OrtSessionOptions object has the execution providers with which the model will be compiled.
+ *
+ * ReleaseModelCompilationOptions must be called to free the OrtModelCompilationOptions after calling
+ * CompileModel.
+ *
+ * \param[in] env OrtEnv object.
+ * \param[in] session_options The OrtSessionOptions instance from which to create the OrtModelCompilationOptions.
+ * \param[out] out The created OrtModelCompilationOptions instance.
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.22.
+ */
+ ORT_API2_STATUS(CreateModelCompilationOptionsFromSessionOptions, _In_ const OrtEnv* env,
+ _In_ const OrtSessionOptions* session_options, _Outptr_ OrtModelCompilationOptions** out);
+
+ /** \brief Sets the file path to the input ONNX model to compile.
+ *
+ * The input model's location (e.g., file path or memory buffer) must be set with either
+ * ModelCompilationOptions_SetInputModelPath or ModelCompilationOptions_SetInputModelFromBuffer.
+ *
+ * \param[in] model_compile_options The OrtModelCompilationOptions instance.
+ * \param[in] input_model_path Null terminated string of the path (wchar on Windows, char otherwise).
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.22.
+ */
+ ORT_API2_STATUS(ModelCompilationOptions_SetInputModelPath, _In_ OrtModelCompilationOptions* model_compile_options,
+ _In_ const ORTCHAR_T* input_model_path);
+
+ /** \brief Sets the buffer that stores the bytes of the loaded ONNX model to compile.
+ *
+ * The input model's location (e.g., file path or memory buffer) must be set with either
+ * ModelCompilationOptions_SetInputModelPath or ModelCompilationOptions_SetInputModelFromBuffer.
+ *
+ * \param[in] model_compile_options The OrtModelCompilationOptions instance.
+ * \param[in] input_model_data Buffer containing the loaded ONNX model bytes.
+ * \param[in] input_model_data_size The number of bytes in the `input_model_data` buffer.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.22.
+ */
+ ORT_API2_STATUS(ModelCompilationOptions_SetInputModelFromBuffer,
+ _In_ OrtModelCompilationOptions* model_compile_options,
+ _In_ const void* input_model_data,
+ size_t input_model_data_size);
+
+ /** \brief Sets the file path for the output ONNX model generated by CompileModel.
+ *
+ * The output model's location (e.g., file path or memory buffer) can be set with either
+ * ModelCompilationOptions_SetOutputModelPath or ModelCompilationOptions_SetOutputModelBuffer.
+ *
+ * If the output model's location is not set, ONNX Runtime will generate an output file with a path based on
+ * the input model's file path. Examples:
+ * /Path/my_model.onnx -> /Path/my_model_ctx.onnx
+ * /Path/my_model -> /Path/my_model_ctx.onnx
+ *
+ * \param[in] model_compile_options The OrtModelCompilationOptions instance.
+ * \param[in] output_model_path Null terminated string of the path (wchar on Windows, char otherwise).
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.22.
+ */
+ ORT_API2_STATUS(ModelCompilationOptions_SetOutputModelPath, _In_ OrtModelCompilationOptions* model_compile_options,
+ _In_ const ORTCHAR_T* output_model_path);
+
+ /** \brief Optionally sets the file that should store external initializers for the compiled ONNX model.
+ * If not set, initializers are stored within the model.
+ *
+ * Only initializers for nodes that were not compiled are stored in the external initializers file.
+ * Compiled nodes contain their initializer data within the `ep_cache_context` attribute of EPContext nodes.
+ * Refer to ModelCompilationOptions_SetEpContextEmbedMode.
+ *
+ * \param[in] model_compile_options The OrtModelCompilationOptions instance.
+ * \param[in] external_initializers_file_path Null terminated string of the path to the file.
+ * \param[in] external_initializers_size_threshold Initializers larger than this threshold are stored in the file.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.22.
+ */
+ ORT_API2_STATUS(ModelCompilationOptions_SetOutputModelExternalInitializersFile,
+ _In_ OrtModelCompilationOptions* model_compile_options,
+ _In_ const ORTCHAR_T* external_initializers_file_path,
+ size_t external_initializers_size_threshold);
+
+ /** \brief Configures model compilation to store the output compiled ONNX model in a buffer.
+ *
+ * The caller passes an OrtAllocator that ONNX Runtime uses to allocate memory for the buffer.
+ *
+ * The output model's location (e.g., file path or memory buffer) can be set with either
+ * ModelCompilationOptions_SetOutputModelPath or ModelCompilationOptions_SetOutputModelBuffer.
+ *
+ * If the output model's location is not set, ONNX Runtime will generate an output file with a path based on
+ * the input model's file path. Examples:
+ * /Path/my_model.onnx -> /Path/my_model_ctx.onnx
+ * /Path/my_model -> /Path/my_model_ctx.onnx
+ *
+ * \param[in] model_compile_options The OrtModelCompilationOptions instance.
+ * \param[in] allocator The allocator used to allocate the buffer for the compiled model.
+ * \param[out] output_model_buffer_ptr Pointer to the buffer that stores the compiled model.
+ * \param[out] output_model_buffer_size_ptr Pointer set to the size of output model in bytes.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.22.
+ */
+ ORT_API2_STATUS(ModelCompilationOptions_SetOutputModelBuffer,
+ _In_ OrtModelCompilationOptions* model_compile_options,
+ _Inout_ OrtAllocator* allocator,
+ _Outptr_ void** output_model_buffer_ptr,
+ _Out_ size_t* output_model_buffer_size_ptr);
+
+ /** \brief Enables or disables the embedding of EPContext binary data into the `ep_cache_context` attribute
+ * of EPContext nodes. Defaults to false.
+ *
+ * If enabled, the `ep_cache_context` attribute of EPContext nodes will store the context binary data, which may
+ * include weights for compiled subgraphs.
+ *
+ * If disabled, the `ep_cache_context` attribute of EPContext nodes will contain the path to the file containing the
+ * context binary data. The path is set by the execution provider creating the EPContext node.
+ *
+ * For more details about the EPContext design, refer to:
+ * \htmlonly
+ * EPContext design document.
+ * \endhtmlonly
+ *
+ * \param[in] model_compile_options The OrtModelCompilationOptions instance.
+ * \param[in] embed_ep_context_in_model True to embed EPContext binary data into the EPContext node
+ * `ep_cache_context` attributes.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.22.
+ */
+ ORT_API2_STATUS(ModelCompilationOptions_SetEpContextEmbedMode, _In_ OrtModelCompilationOptions* model_compile_options,
+ bool embed_ep_context_in_model);
+
+ /** \brief Compiles an input ONNX model with the given compilation options.
+ *
+ * \param[in] env OrtEnv object.
+ * \param[in] model_options The compilation options for the model.
+ *
+ * \snippet{doc} snippets.dox OrtStatus Return Value
+ *
+ * \since Version 1.22.
+ */
+ ORT_API2_STATUS(CompileModel, _In_ const OrtEnv* env, _In_ const OrtModelCompilationOptions* model_options);
+};
/*
* This is the old way to add the CUDA provider to the session, please use SessionOptionsAppendExecutionProvider_CUDA above to access the latest functionality
* This function always exists, but will only succeed if Onnxruntime was built with CUDA support and the CUDA provider shared library exists
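
// A hedged sketch of the new Compile API used buffer-to-buffer, complementing the
// path-based example in the OrtCompileApi doc comment above. Error handling is
// elided; `env`, `session_options`, and the model bytes are assumed to already exist.
#include <stddef.h>
#include "onnxruntime_c_api.h"  // assumed include path

void CompileFromBuffer(const OrtEnv* env, const OrtSessionOptions* session_options,
                       const void* model_bytes, size_t model_size) {
  const OrtApi* ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
  const OrtCompileApi* compile_api = ort->GetCompileApi();

  OrtModelCompilationOptions* opts = NULL;
  compile_api->CreateModelCompilationOptionsFromSessionOptions(env, session_options, &opts);
  compile_api->ModelCompilationOptions_SetInputModelFromBuffer(opts, model_bytes, model_size);

  // Write the compiled model into an allocator-owned buffer instead of a file.
  OrtAllocator* allocator = NULL;
  ort->GetAllocatorWithDefaultOptions(&allocator);
  void* out_bytes = NULL;
  size_t out_size = 0;
  compile_api->ModelCompilationOptions_SetOutputModelBuffer(opts, allocator, &out_bytes, &out_size);
  compile_api->ModelCompilationOptions_SetEpContextEmbedMode(opts, true);  // embed context binaries

  compile_api->CompileModel(env, opts);  // on success, out_bytes/out_size hold the compiled model
  compile_api->ReleaseModelCompilationOptions(opts);
  // The caller eventually frees out_bytes with allocator->Free(allocator, out_bytes).
}
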
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index ce7dc1c45b05e..a2937b6e82a27 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -158,6 +158,20 @@ inline const OrtModelEditorApi& GetModelEditorApi() {
return *api;
}
+/// <summary>
+/// This returns a reference to the ORT C Compile API. Used if compiling a model at runtime.
+/// </summary>
+/// <returns>ORT C Compile API reference</returns>
+inline const OrtCompileApi& GetCompileApi() {
+ auto* api = GetApi().GetCompileApi();
+ if (api == nullptr) {
+ // minimal build
+ ORT_CXX_API_THROW("Compile API is not available in this build", ORT_FAIL);
+ }
+
+ return *api;
+}
+
/** \brief IEEE 754 half-precision floating point data type
*
* \details This struct is used for converting float to float16 and back
@@ -517,6 +531,9 @@ namespace detail {
#define ORT_DEFINE_RELEASE(NAME) \
inline void OrtRelease(Ort##NAME* ptr) { GetApi().Release##NAME(ptr); }
+#define ORT_DEFINE_RELEASE_FROM_API_STRUCT(NAME, API_GETTER) \
+ inline void OrtRelease(Ort##NAME* ptr) { API_GETTER().Release##NAME(ptr); }
+
ORT_DEFINE_RELEASE(Allocator);
ORT_DEFINE_RELEASE(MemoryInfo);
ORT_DEFINE_RELEASE(CustomOpDomain);
@@ -542,8 +559,10 @@ ORT_DEFINE_RELEASE(ValueInfo);
ORT_DEFINE_RELEASE(Node);
ORT_DEFINE_RELEASE(Graph);
ORT_DEFINE_RELEASE(Model);
+ORT_DEFINE_RELEASE_FROM_API_STRUCT(ModelCompilationOptions, GetCompileApi);
#undef ORT_DEFINE_RELEASE
+#undef ORT_DEFINE_RELEASE_FROM_API_STRUCT
/** \brief This is a tagging template type. Use it with Base<T> to indicate that the C++ interface object
* has no ownership of the underlying C object.
@@ -992,6 +1011,38 @@ struct SessionOptions : detail::SessionOptionsImpl {
ConstSessionOptions GetConst() const { return ConstSessionOptions{this->p_}; }
};
+/** \brief Options object used when compiling a model.
+ *
+ * Wraps ::OrtModelCompilationOptions object and methods
+ */
+struct ModelCompilationOptions : detail::Base<OrtModelCompilationOptions> {
+ using Base = detail::Base<OrtModelCompilationOptions>;
+ using Base::Base;
+
+ explicit ModelCompilationOptions(std::nullptr_t) {} ///< Create an empty ModelCompilationOptions object, must be assigned a valid one to be used.
+
+ ModelCompilationOptions(const Env& env, const SessionOptions& session_options); ///< Wraps OrtApi::CreateModelCompilationOptionsFromSessionOptions
+ ModelCompilationOptions(const Env& env, ConstSessionOptions session_options); ///< Wraps OrtApi::CreateModelCompilationOptionsFromSessionOptions
+
+ ModelCompilationOptions& SetInputModelPath(const ORTCHAR_T* input_model_path); ///< Wraps OrtApi::ModelCompilationOptions_SetInputModelPath
+ ModelCompilationOptions& SetInputModelFromBuffer(const void* input_model_data,
+ size_t input_model_data_size); ///< Wraps OrtApi::ModelCompilationOptions_SetInputModelFromBuffer
+ ModelCompilationOptions& SetEpContextEmbedMode(bool embed_ep_context_in_model); ///< Wraps OrtApi::ModelCompilationOptions_SetEpContextEmbedMode
+ ModelCompilationOptions& SetOutputModelPath(const ORTCHAR_T* output_model_path); ///< Wraps OrtApi::ModelCompilationOptions_SetOutputModelPath
+ ModelCompilationOptions& SetOutputModelExternalInitializersFile(const ORTCHAR_T* file_path,
+ size_t initializer_size_threshold); ///< Wraps OrtApi::ModelCompilationOptions_SetOutputModelExternalInitializersFile
+ ModelCompilationOptions& SetOutputModelBuffer(OrtAllocator* allocator, void** output_model_buffer_ptr,
+ size_t* output_model_buffer_size_ptr); ///< Wraps OrtApi::ModelCompilationOptions_SetOutputModelBuffer
+};
+
+/** \brief Compiles an input model to generate a model with EPContext nodes that execute EP-specific kernels. Wraps OrtApi::CompileModel.
+ *
+ * \param env: ORT environment object.
+ * \param model_compilation_options: Compilation options for a model.
+ * \return A Status indicating success or failure.
+ */
+Status CompileModel(const Env& env, const ModelCompilationOptions& model_compilation_options);
+
/** \brief Wrapper around ::OrtModelMetadata
*
*/
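
// A minimal sketch of the C++ wrapper declared above (file names are illustrative):
#include "onnxruntime_cxx_api.h"  // assumed include path

int main() {
  Ort::Env env;
  Ort::SessionOptions session_options;
  // Append the compiling EP(s) to session_options before compiling, e.g. with
  // session_options.AppendExecutionProvider(...).

  Ort::ModelCompilationOptions compile_options(env, session_options);
  compile_options.SetInputModelPath(ORT_TSTR("model.onnx"))
      .SetOutputModelPath(ORT_TSTR("model_ctx.onnx"))
      .SetEpContextEmbedMode(false);  // context binaries stay in side files

  Ort::Status status = Ort::CompileModel(env, compile_options);
  return status.IsOK() ? 0 : 1;
}
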
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index 524e3ecc92936..e41ef005349ac 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -630,6 +630,62 @@ inline RunOptions& RunOptions::AddActiveLoraAdapter(const LoraAdapter& adapter)
return *this;
}
+inline ModelCompilationOptions::ModelCompilationOptions(const Env& env, const SessionOptions& session_options) {
+ ThrowOnError(GetCompileApi().CreateModelCompilationOptionsFromSessionOptions(env, session_options, &this->p_));
+}
+
+inline ModelCompilationOptions::ModelCompilationOptions(const Env& env, ConstSessionOptions session_options) {
+ ThrowOnError(GetCompileApi().CreateModelCompilationOptionsFromSessionOptions(env, session_options, &this->p_));
+}
+
+inline Status CompileModel(const Env& env, const ModelCompilationOptions& model_compilation_options) {
+ return Ort::Status(GetCompileApi().CompileModel(env, model_compilation_options));
+}
+
+inline ModelCompilationOptions& ModelCompilationOptions::SetInputModelPath(
+ const ORTCHAR_T* input_model_path) {
+ Ort::ThrowOnError(GetCompileApi().ModelCompilationOptions_SetInputModelPath(this->p_, input_model_path));
+ return *this;
+}
+
+inline ModelCompilationOptions& ModelCompilationOptions::SetInputModelFromBuffer(
+ const void* input_model_data, size_t input_model_data_size) {
+ Ort::ThrowOnError(GetCompileApi().ModelCompilationOptions_SetInputModelFromBuffer(this->p_, input_model_data,
+ input_model_data_size));
+ return *this;
+}
+
+inline ModelCompilationOptions& ModelCompilationOptions::SetOutputModelPath(
+ const ORTCHAR_T* output_model_path) {
+ Ort::ThrowOnError(GetCompileApi().ModelCompilationOptions_SetOutputModelPath(this->p_, output_model_path));
+ return *this;
+}
+
+inline ModelCompilationOptions& ModelCompilationOptions::SetOutputModelExternalInitializersFile(
+ const ORTCHAR_T* file_path, size_t initializer_size_threshold) {
+ Ort::ThrowOnError(GetCompileApi().ModelCompilationOptions_SetOutputModelExternalInitializersFile(
+ this->p_,
+ file_path,
+ initializer_size_threshold));
+ return *this;
+}
+
+inline ModelCompilationOptions& ModelCompilationOptions::SetOutputModelBuffer(
+ OrtAllocator* allocator, void** output_model_buffer_ptr, size_t* output_model_buffer_size_ptr) {
+ Ort::ThrowOnError(GetCompileApi().ModelCompilationOptions_SetOutputModelBuffer(this->p_, allocator,
+ output_model_buffer_ptr,
+ output_model_buffer_size_ptr));
+ return *this;
+}
+
+inline ModelCompilationOptions& ModelCompilationOptions::SetEpContextEmbedMode(
+ bool embed_ep_context_in_model) {
+ Ort::ThrowOnError(GetCompileApi().ModelCompilationOptions_SetEpContextEmbedMode(
+ this->p_,
+ embed_ep_context_in_model));
+ return *this;
+}
+
namespace detail {
template <typename T>
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index af1f9c04b2831..379c74e011d6e 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -10,10 +10,10 @@
* "[Area][.[SubArea1].[SubArea2]...].[Keyname]"
* Such as "ep.cuda.use_arena"
* The Config Key cannot be empty
- * The maximum length of the Config Key is 128
+ * The maximum length of the Config Key is 1024
*
* The string format of a SessionOptions Config Value is defined individually for each Config.
- * The maximum length of the Config Value is 1024
+ * The maximum length of the Config Value is 2048
*/
// Key for disable PrePacking,
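
// A short sketch of how these limits apply: config entries are plain string
// key/value pairs validated against the maximum lengths above when they are added.
// "ep.cuda.use_arena" is the example key quoted in this header's own comment.
#include "onnxruntime_cxx_api.h"  // assumed include path

void Configure(Ort::SessionOptions& session_options) {
  // Keys longer than 1024 characters or values longer than 2048 are rejected.
  session_options.AddConfigEntry("ep.cuda.use_arena", "1");
}
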
diff --git a/java/build-android.gradle b/java/build-android.gradle
index 9c4275b74f626..610625cf02e54 100644
--- a/java/build-android.gradle
+++ b/java/build-android.gradle
@@ -147,7 +147,7 @@ artifacts {
dependencies {
testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.0'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.7.0'
- testImplementation 'com.google.protobuf:protobuf-java:3.21.7'
+ testImplementation 'com.google.protobuf:protobuf-java:3.25.5'
}
publishing {
diff --git a/java/build.gradle b/java/build.gradle
index 8452daab72872..2d43d1ead13f0 100644
--- a/java/build.gradle
+++ b/java/build.gradle
@@ -179,7 +179,7 @@ if (cmakeBuildDir != null) {
dependencies {
testImplementation 'org.junit.jupiter:junit-jupiter-api:5.9.2'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.9.2'
- testImplementation 'com.google.protobuf:protobuf-java:3.21.7'
+ testImplementation 'com.google.protobuf:protobuf-java:3.25.5'
}
processTestResources {
diff --git a/js/.vscode/launch.json b/js/.vscode/launch.json
index 5fd79872cf07b..6fd4c855054bc 100644
--- a/js/.vscode/launch.json
+++ b/js/.vscode/launch.json
@@ -16,6 +16,15 @@
"sourceMaps": true,
"preLaunchTask": "tsc: build - common/test/tsconfig.json"
},
+ {
+ "name": "[node] Launch installation script",
+ "program": "${workspaceFolder}/node/script/install.js",
+ "request": "launch",
+ "skipFiles": ["/**"],
+ "type": "node",
+ "cwd": "${workspaceFolder}/node",
+ "args": ["--onnxruntime-node-install"]
+ },
{
"name": "[web] Launch Build script in Node.js",
"program": "${workspaceFolder}/web/script/build.js",
diff --git a/js/README.md b/js/README.md
index eb95c9224c081..dbc58f3a75ebd 100644
--- a/js/README.md
+++ b/js/README.md
@@ -24,9 +24,9 @@ Please follow the steps described below to setup development environment.
### Prerequisites
-- Node.js (16.0+): https://nodejs.org/ - (Optional) Use nvm ([Windows](https://github.com/coreybutler/nvm-windows) / [Mac/Linux](https://github.com/creationix/nvm)) to install Node.js
+- Node.js (20.0+): https://nodejs.org/ - (Optional) Use nvm ([Windows](https://github.com/coreybutler/nvm-windows) / [Mac/Linux](https://github.com/creationix/nvm)) to install Node.js
-- Python (2.7 or 3.6+): https://www.python.org/downloads/
+- Python (3.9+): https://www.python.org/downloads/
- python should be added to the PATH environment variable
@@ -72,7 +72,7 @@ This project is designed to include all "common" code, which are pure javascript
### Requirements
-Node.js v12+ (recommended v14+)
+Node.js v20+
### Build
@@ -108,7 +108,7 @@ Document will be generated in folder `/js/common/docs`.
> language: typescript/C++
-> dependency: onnxruntime-common, ONNXRuntime.dll
+> dependency: onnxruntime-common, ONNXRuntime shared library (.so/.dll/.dylib)
> folder: /js/node
@@ -116,7 +116,7 @@ This project is designed to be used as a NPM package to enable Node.js users to
### Requirements
-Node.js v12+ (recommended v14+)
+Node.js v20+
### Build
diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt
index c78b40a3e7429..2bd6f22e5f901 100644
--- a/js/node/CMakeLists.txt
+++ b/js/node/CMakeLists.txt
@@ -12,7 +12,7 @@ execute_process(COMMAND node -e "console.log(process.platform)"
OUTPUT_VARIABLE node_platform OUTPUT_STRIP_TRAILING_WHITESPACE)
file(READ ${CMAKE_SOURCE_DIR}/../../VERSION_NUMBER ort_version)
string(STRIP "${ort_version}" ort_version)
-set(dist_folder "${CMAKE_SOURCE_DIR}/bin/napi-v3/${node_platform}/${NODE_ARCH}/")
+set(dist_folder "${CMAKE_SOURCE_DIR}/bin/napi-v6/${node_platform}/${NODE_ARCH}/")
# onnxruntime.dll dir
if(NOT ONNXRUNTIME_BUILD_DIR)
diff --git a/js/node/README.md b/js/node/README.md
index abb91bf05ddf1..c271d8daccc8b 100644
--- a/js/node/README.md
+++ b/js/node/README.md
@@ -10,6 +10,12 @@ Install the latest stable version:
npm install onnxruntime-node
```
+Install the nightly version:
+
+```
+npm install onnxruntime-node@dev
+```
+
Refer to [ONNX Runtime JavaScript examples](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/js) for samples and tutorials.
## Requirements
@@ -18,33 +24,32 @@ ONNXRuntime works on Node.js v16.x+ (recommend v20.x+) or Electron v15.x+ (recom
The following table lists the supported versions of ONNX Runtime Node.js binding provided with pre-built binaries.
-| EPs/Platforms | Windows x64 | Windows arm64 | Linux x64 | Linux arm64 | MacOS x64 | MacOS arm64 |
-| ------------- | ----------- | ------------- | ----------------- | ----------- | --------- | ----------- |
-| CPU | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |
-| DirectML | ✔️ | ✔️ | ❌ | ❌ | ❌ | ❌ |
-| CUDA | ❌ | ❌ | ✔️\[1] | ❌ | ❌ | ❌ |
+| EPs/Platforms | Windows x64 | Windows arm64 | Linux x64 | Linux arm64 | MacOS x64 | MacOS arm64 |
+| ------------- | ------------------ | ------------------ | ------------------ | ------------------ | ------------------ | ------------------ |
+| CPU | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |
+| WebGPU | ✔️ \[1] | ✔️ \[1] | ✔️ \[1] | ✔️ \[1] | ✔️ \[1] | ✔️ \[1] |
+| DirectML | ✔️ | ✔️ | ❌ | ❌ | ❌ | ❌ |
+| CUDA | ❌ | ❌ | ✔️\[2] | ❌ | ❌ | ❌ |
+| CoreML | ❌ | ❌ | ❌ | ❌ | ✔️ | ✔️ |
-- \[1]: CUDA v11.8.
+- \[1]: WebGPU support is currently experimental.
+- \[2]: CUDA v12. See [CUDA EP Installation](#cuda-ep-installation) for details.
To use on platforms without pre-built binaries, you can build Node.js binding from source and consume it by `npm install /js/node/`. See also [instructions](https://onnxruntime.ai/docs/build/inferencing.html#apis-and-language-bindings) for building ONNX Runtime Node.js binding locally.
# GPU Support
-Right now, the Windows version supports only the DML provider. Linux x64 can use CUDA and TensorRT.
+Right now, the Windows version supports the WebGPU and DirectML execution providers. Linux x64 can use CUDA and TensorRT.
## CUDA EP Installation
-To use CUDA EP, you need to install the CUDA EP binaries. By default, the CUDA EP binaries are installed automatically when you install the package. If you want to skip the installation, you can pass the `--onnxruntime-node-install-cuda=skip` flag to the installation command.
+To use CUDA EP, you need to install the CUDA EP binaries. By default, the CUDA EP binaries are installed automatically when you install the package. If you want to skip the installation, you can pass the `--onnxruntime-node-install=skip` flag to the installation command.
```
-npm install onnxruntime-node --onnxruntime-node-install-cuda=skip
+npm install onnxruntime-node --onnxruntime-node-install=skip
```
-You can also use this flag to specify the version of the CUDA: (v11 or v12)
-
-```
-npm install onnxruntime-node --onnxruntime-node-install-cuda=v12
-```
+~~You can also use this flag to specify the CUDA version (v11 or v12).~~ CUDA v11 is no longer supported since v1.22.
## License
diff --git a/js/node/lib/binding.ts b/js/node/lib/binding.ts
index ed133734ce66a..ab4a72a4e60a5 100644
--- a/js/node/lib/binding.ts
+++ b/js/node/lib/binding.ts
@@ -53,7 +53,7 @@ export declare namespace Binding {
// export native binding
export const binding =
// eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires
- require(`../bin/napi-v3/${process.platform}/${process.arch}/onnxruntime_binding.node`) as {
+ require(`../bin/napi-v6/${process.platform}/${process.arch}/onnxruntime_binding.node`) as {
// eslint-disable-next-line @typescript-eslint/naming-convention
InferenceSession: Binding.InferenceSessionConstructor;
listSupportedBackends: () => Binding.SupportedBackend[];
diff --git a/js/node/package-lock.json b/js/node/package-lock.json
index 41ffb071b9ced..b445ce9e8c5c6 100644
--- a/js/node/package-lock.json
+++ b/js/node/package-lock.json
@@ -15,9 +15,9 @@
"linux"
],
"dependencies": {
+ "adm-zip": "^0.5.16",
"global-agent": "^3.0.0",
- "onnxruntime-common": "file:../common",
- "tar": "^7.0.1"
+ "onnxruntime-common": "file:../common"
},
"devDependencies": {
"@types/minimist": "^1.2.2",
@@ -36,123 +36,6 @@
"typedoc": "^0.25.7"
}
},
- "node_modules/@isaacs/cliui": {
- "version": "8.0.2",
- "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz",
- "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==",
- "dependencies": {
- "string-width": "^5.1.2",
- "string-width-cjs": "npm:string-width@^4.2.0",
- "strip-ansi": "^7.0.1",
- "strip-ansi-cjs": "npm:strip-ansi@^6.0.1",
- "wrap-ansi": "^8.1.0",
- "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0"
- },
- "engines": {
- "node": ">=12"
- }
- },
- "node_modules/@isaacs/cliui/node_modules/ansi-regex": {
- "version": "6.0.1",
- "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz",
- "integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==",
- "engines": {
- "node": ">=12"
- },
- "funding": {
- "url": "https://github.com/chalk/ansi-regex?sponsor=1"
- }
- },
- "node_modules/@isaacs/cliui/node_modules/ansi-styles": {
- "version": "6.2.1",
- "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
- "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==",
- "engines": {
- "node": ">=12"
- },
- "funding": {
- "url": "https://github.com/chalk/ansi-styles?sponsor=1"
- }
- },
- "node_modules/@isaacs/cliui/node_modules/emoji-regex": {
- "version": "9.2.2",
- "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz",
- "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg=="
- },
- "node_modules/@isaacs/cliui/node_modules/string-width": {
- "version": "5.1.2",
- "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz",
- "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==",
- "dependencies": {
- "eastasianwidth": "^0.2.0",
- "emoji-regex": "^9.2.2",
- "strip-ansi": "^7.0.1"
- },
- "engines": {
- "node": ">=12"
- },
- "funding": {
- "url": "https://github.com/sponsors/sindresorhus"
- }
- },
- "node_modules/@isaacs/cliui/node_modules/strip-ansi": {
- "version": "7.1.0",
- "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz",
- "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==",
- "dependencies": {
- "ansi-regex": "^6.0.1"
- },
- "engines": {
- "node": ">=12"
- },
- "funding": {
- "url": "https://github.com/chalk/strip-ansi?sponsor=1"
- }
- },
- "node_modules/@isaacs/cliui/node_modules/wrap-ansi": {
- "version": "8.1.0",
- "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz",
- "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==",
- "dependencies": {
- "ansi-styles": "^6.1.0",
- "string-width": "^5.0.1",
- "strip-ansi": "^7.0.1"
- },
- "engines": {
- "node": ">=12"
- },
- "funding": {
- "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
- }
- },
- "node_modules/@isaacs/fs-minipass": {
- "version": "4.0.0",
- "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.0.tgz",
- "integrity": "sha512-S00nN1Qt3z3dSP6Db45fj/mksrAq5XWNIJ/SWXGP8XPT2jrzEuYRCSEx08JpJwBcG2F1xgiOtBMGDU0AZHmxew==",
- "dependencies": {
- "minipass": "^7.0.4"
- },
- "engines": {
- "node": ">=18.0.0"
- }
- },
- "node_modules/@isaacs/fs-minipass/node_modules/minipass": {
- "version": "7.0.4",
- "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.0.4.tgz",
- "integrity": "sha512-jYofLM5Dam9279rdkWzqHozUo4ybjdZmCsDHePy5V/PbBcVMiSZR97gmAy45aqi8CK1lG2ECd356FU86avfwUQ==",
- "engines": {
- "node": ">=16 || 14 >=14.17"
- }
- },
- "node_modules/@pkgjs/parseargs": {
- "version": "0.11.0",
- "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
- "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==",
- "optional": true,
- "engines": {
- "node": ">=14"
- }
- },
"node_modules/@protobufjs/aspromise": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
@@ -229,10 +112,20 @@
"integrity": "sha512-93+VvleD3mXwlLI/xASjw0FzKcwzl3OdTCzm1LaRfqgS21gfFtK3zDXM5Op9TeeMsJVOaJ2VRDpT9q4Y3d0AvA==",
"dev": true
},
+ "node_modules/adm-zip": {
+ "version": "0.5.16",
+ "resolved": "https://registry.npmjs.org/adm-zip/-/adm-zip-0.5.16.tgz",
+ "integrity": "sha512-TGw5yVi4saajsSEgz25grObGHEUaDrniwvA2qwSC060KfqGPdglhvPMA2lPIoxs3PQIItj2iag35fONcQqgUaQ==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=12.0"
+ }
+ },
"node_modules/ansi-regex": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
"integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+ "dev": true,
"engines": {
"node": ">=8"
}
@@ -241,6 +134,7 @@
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+ "dev": true,
"dependencies": {
"color-convert": "^2.0.1"
},
@@ -288,33 +182,12 @@
"proxy-from-env": "^1.1.0"
}
},
- "node_modules/balanced-match": {
- "version": "1.0.2",
- "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
- "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="
- },
"node_modules/boolean": {
"version": "3.2.0",
"resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz",
"integrity": "sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==",
"deprecated": "Package no longer supported. Contact Support at https://www.npmjs.com/support for more info."
},
- "node_modules/brace-expansion": {
- "version": "2.0.1",
- "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
- "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
- "dependencies": {
- "balanced-match": "^1.0.0"
- }
- },
- "node_modules/chownr": {
- "version": "3.0.0",
- "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz",
- "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==",
- "engines": {
- "node": ">=18"
- }
- },
"node_modules/cliui": {
"version": "8.0.1",
"resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
@@ -423,6 +296,7 @@
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+ "dev": true,
"dependencies": {
"color-name": "~1.1.4"
},
@@ -433,7 +307,8 @@
"node_modules/color-name": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
- "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA=="
+ "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+ "dev": true
},
"node_modules/color-support": {
"version": "1.1.3",
@@ -462,19 +337,6 @@
"integrity": "sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==",
"dev": true
},
- "node_modules/cross-spawn": {
- "version": "7.0.6",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
- "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
- "dependencies": {
- "path-key": "^3.1.0",
- "shebang-command": "^2.0.0",
- "which": "^2.0.1"
- },
- "engines": {
- "node": ">= 8"
- }
- },
"node_modules/debug": {
"version": "4.3.4",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz",
@@ -553,15 +415,11 @@
"resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz",
"integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g=="
},
- "node_modules/eastasianwidth": {
- "version": "0.2.0",
- "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz",
- "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA=="
- },
"node_modules/emoji-regex": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
- "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A=="
+ "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
+ "dev": true
},
"node_modules/error-ex": {
"version": "1.3.2",
@@ -639,32 +497,6 @@
}
}
},
- "node_modules/foreground-child": {
- "version": "3.1.1",
- "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.1.1.tgz",
- "integrity": "sha512-TMKDUnIte6bfb5nWv7V/caI169OHgvwjb7V4WkeUvbQQdjr5rWKqHFiKWb/fcOwB+CzBT+qbWjvj+DVwRskpIg==",
- "dependencies": {
- "cross-spawn": "^7.0.0",
- "signal-exit": "^4.0.1"
- },
- "engines": {
- "node": ">=14"
- },
- "funding": {
- "url": "https://github.com/sponsors/isaacs"
- }
- },
- "node_modules/foreground-child/node_modules/signal-exit": {
- "version": "4.1.0",
- "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz",
- "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==",
- "engines": {
- "node": ">=14"
- },
- "funding": {
- "url": "https://github.com/sponsors/isaacs"
- }
- },
"node_modules/form-data": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
@@ -745,35 +577,6 @@
"node": "6.* || 8.* || >= 10.*"
}
},
- "node_modules/glob": {
- "version": "10.3.12",
- "resolved": "https://registry.npmjs.org/glob/-/glob-10.3.12.tgz",
- "integrity": "sha512-TCNv8vJ+xz4QiqTpfOJA7HvYv+tNIRHKfUWw/q+v2jdgN4ebz+KY9tGx5J4rHP0o84mNP+ApH66HRX8us3Khqg==",
- "dependencies": {
- "foreground-child": "^3.1.0",
- "jackspeak": "^2.3.6",
- "minimatch": "^9.0.1",
- "minipass": "^7.0.4",
- "path-scurry": "^1.10.2"
- },
- "bin": {
- "glob": "dist/esm/bin.mjs"
- },
- "engines": {
- "node": ">=16 || 14 >=14.17"
- },
- "funding": {
- "url": "https://github.com/sponsors/isaacs"
- }
- },
- "node_modules/glob/node_modules/minipass": {
- "version": "7.0.4",
- "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.0.4.tgz",
- "integrity": "sha512-jYofLM5Dam9279rdkWzqHozUo4ybjdZmCsDHePy5V/PbBcVMiSZR97gmAy45aqi8CK1lG2ECd356FU86avfwUQ==",
- "engines": {
- "node": ">=16 || 14 >=14.17"
- }
- },
"node_modules/global-agent": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/global-agent/-/global-agent-3.0.0.tgz",
@@ -861,6 +664,7 @@
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
"integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
+ "dev": true,
"engines": {
"node": ">=8"
}
@@ -868,24 +672,8 @@
"node_modules/isexe": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
- "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="
- },
- "node_modules/jackspeak": {
- "version": "2.3.6",
- "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-2.3.6.tgz",
- "integrity": "sha512-N3yCS/NegsOBokc8GAdM8UcmfsKiSS8cipheD/nivzr700H+nsMOxJjQnvwOcRYVuFkdH0wGUvW2WbXGmrZGbQ==",
- "dependencies": {
- "@isaacs/cliui": "^8.0.2"
- },
- "engines": {
- "node": ">=14"
- },
- "funding": {
- "url": "https://github.com/sponsors/isaacs"
- },
- "optionalDependencies": {
- "@pkgjs/parseargs": "^0.11.0"
- }
+ "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
+ "dev": true
},
"node_modules/json-parse-better-errors": {
"version": "1.0.2",
@@ -991,20 +779,6 @@
"node": ">= 0.6"
}
},
- "node_modules/minimatch": {
- "version": "9.0.4",
- "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.4.tgz",
- "integrity": "sha512-KqWh+VchfxcMNRAJjj2tnsSJdNbHsVgnkBhTNrW7AjVo6OvLtxw8zfT9oLw1JSohlFzJ8jCoTgaoXvJ+kHt6fw==",
- "dependencies": {
- "brace-expansion": "^2.0.1"
- },
- "engines": {
- "node": ">=16 || 14 >=14.17"
- },
- "funding": {
- "url": "https://github.com/sponsors/isaacs"
- }
- },
"node_modules/minimist": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
@@ -1018,30 +792,11 @@
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/minipass/-/minipass-5.0.0.tgz",
"integrity": "sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==",
+ "dev": true,
"engines": {
"node": ">=8"
}
},
- "node_modules/minizlib": {
- "version": "3.0.1",
- "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-3.0.1.tgz",
- "integrity": "sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==",
- "dependencies": {
- "minipass": "^7.0.4",
- "rimraf": "^5.0.5"
- },
- "engines": {
- "node": ">= 18"
- }
- },
- "node_modules/minizlib/node_modules/minipass": {
- "version": "7.0.4",
- "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.0.4.tgz",
- "integrity": "sha512-jYofLM5Dam9279rdkWzqHozUo4ybjdZmCsDHePy5V/PbBcVMiSZR97gmAy45aqi8CK1lG2ECd356FU86avfwUQ==",
- "engines": {
- "node": ">=16 || 14 >=14.17"
- }
- },
"node_modules/mkdirp": {
"version": "0.5.6",
"resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz",
@@ -1112,37 +867,6 @@
"node": ">=4"
}
},
- "node_modules/path-key": {
- "version": "3.1.1",
- "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
- "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==",
- "engines": {
- "node": ">=8"
- }
- },
- "node_modules/path-scurry": {
- "version": "1.10.2",
- "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.10.2.tgz",
- "integrity": "sha512-7xTavNy5RQXnsjANvVvMkEjvloOinkAjv/Z6Ildz9v2RinZ4SBKTWFOVRbaF8p0vpHnyjV/UwNDdKuUv6M5qcA==",
- "dependencies": {
- "lru-cache": "^10.2.0",
- "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0"
- },
- "engines": {
- "node": ">=16 || 14 >=14.17"
- },
- "funding": {
- "url": "https://github.com/sponsors/isaacs"
- }
- },
- "node_modules/path-scurry/node_modules/lru-cache": {
- "version": "10.2.0",
- "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.2.0.tgz",
- "integrity": "sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q==",
- "engines": {
- "node": "14 || >=16.14"
- }
- },
"node_modules/protobufjs": {
"version": "7.2.5",
"resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.5.tgz",
@@ -1220,23 +944,6 @@
"node": ">=0.10.0"
}
},
- "node_modules/rimraf": {
- "version": "5.0.5",
- "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-5.0.5.tgz",
- "integrity": "sha512-CqDakW+hMe/Bz202FPEymy68P+G50RfMQK+Qo5YUqc9SPipvbGjCGKd0RSKEelbsfQuw3g5NZDSrlZZAJurH1A==",
- "dependencies": {
- "glob": "^10.3.7"
- },
- "bin": {
- "rimraf": "dist/esm/bin.mjs"
- },
- "engines": {
- "node": ">=14"
- },
- "funding": {
- "url": "https://github.com/sponsors/isaacs"
- }
- },
"node_modules/roarr": {
"version": "2.15.4",
"resolved": "https://registry.npmjs.org/roarr/-/roarr-2.15.4.tgz",
@@ -1312,25 +1019,6 @@
"integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==",
"dev": true
},
- "node_modules/shebang-command": {
- "version": "2.0.0",
- "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
- "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==",
- "dependencies": {
- "shebang-regex": "^3.0.0"
- },
- "engines": {
- "node": ">=8"
- }
- },
- "node_modules/shebang-regex": {
- "version": "3.0.0",
- "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz",
- "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==",
- "engines": {
- "node": ">=8"
- }
- },
"node_modules/signal-exit": {
"version": "3.0.7",
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
@@ -1355,20 +1043,7 @@
"version": "4.2.3",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
"integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
- "dependencies": {
- "emoji-regex": "^8.0.0",
- "is-fullwidth-code-point": "^3.0.0",
- "strip-ansi": "^6.0.1"
- },
- "engines": {
- "node": ">=8"
- }
- },
- "node_modules/string-width-cjs": {
- "name": "string-width",
- "version": "4.2.3",
- "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
- "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+ "dev": true,
"dependencies": {
"emoji-regex": "^8.0.0",
"is-fullwidth-code-point": "^3.0.0",
@@ -1382,18 +1057,7 @@
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
"integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
- "dependencies": {
- "ansi-regex": "^5.0.1"
- },
- "engines": {
- "node": ">=8"
- }
- },
- "node_modules/strip-ansi-cjs": {
- "name": "strip-ansi",
- "version": "6.0.1",
- "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
- "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+ "dev": true,
"dependencies": {
"ansi-regex": "^5.0.1"
},
@@ -1422,44 +1086,6 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
- "node_modules/tar": {
- "version": "7.0.1",
- "resolved": "https://registry.npmjs.org/tar/-/tar-7.0.1.tgz",
- "integrity": "sha512-IjMhdQMZFpKsHEQT3woZVxBtCQY+0wk3CVxdRkGXEgyGa0dNS/ehPvOMr2nmfC7x5Zj2N+l6yZUpmICjLGS35w==",
- "dependencies": {
- "@isaacs/fs-minipass": "^4.0.0",
- "chownr": "^3.0.0",
- "minipass": "^5.0.0",
- "minizlib": "^3.0.1",
- "mkdirp": "^3.0.1",
- "yallist": "^5.0.0"
- },
- "engines": {
- "node": ">=18"
- }
- },
- "node_modules/tar/node_modules/mkdirp": {
- "version": "3.0.1",
- "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-3.0.1.tgz",
- "integrity": "sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==",
- "bin": {
- "mkdirp": "dist/cjs/src/bin.js"
- },
- "engines": {
- "node": ">=10"
- },
- "funding": {
- "url": "https://github.com/sponsors/isaacs"
- }
- },
- "node_modules/tar/node_modules/yallist": {
- "version": "5.0.0",
- "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz",
- "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==",
- "engines": {
- "node": ">=18"
- }
- },
"node_modules/type-fest": {
"version": "0.13.1",
"resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz",
@@ -1496,6 +1122,7 @@
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
"integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==",
+ "dev": true,
"dependencies": {
"isexe": "^2.0.0"
},
@@ -1532,23 +1159,6 @@
"url": "https://github.com/chalk/wrap-ansi?sponsor=1"
}
},
- "node_modules/wrap-ansi-cjs": {
- "name": "wrap-ansi",
- "version": "7.0.0",
- "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
- "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
- "dependencies": {
- "ansi-styles": "^4.0.0",
- "string-width": "^4.1.0",
- "strip-ansi": "^6.0.0"
- },
- "engines": {
- "node": ">=10"
- },
- "funding": {
- "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
- }
- },
"node_modules/y18n": {
"version": "5.0.8",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
@@ -1592,85 +1202,6 @@
}
},
"dependencies": {
- "@isaacs/cliui": {
- "version": "8.0.2",
- "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz",
- "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==",
- "requires": {
- "string-width": "^5.1.2",
- "string-width-cjs": "npm:string-width@^4.2.0",
- "strip-ansi": "^7.0.1",
- "strip-ansi-cjs": "npm:strip-ansi@^6.0.1",
- "wrap-ansi": "^8.1.0",
- "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0"
- },
- "dependencies": {
- "ansi-regex": {
- "version": "6.0.1",
- "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz",
- "integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA=="
- },
- "ansi-styles": {
- "version": "6.2.1",
- "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz",
- "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug=="
- },
- "emoji-regex": {
- "version": "9.2.2",
- "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz",
- "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg=="
- },
- "string-width": {
- "version": "5.1.2",
- "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz",
- "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==",
- "requires": {
- "eastasianwidth": "^0.2.0",
- "emoji-regex": "^9.2.2",
- "strip-ansi": "^7.0.1"
- }
- },
- "strip-ansi": {
- "version": "7.1.0",
- "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz",
- "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==",
- "requires": {
- "ansi-regex": "^6.0.1"
- }
- },
- "wrap-ansi": {
- "version": "8.1.0",
- "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz",
- "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==",
- "requires": {
- "ansi-styles": "^6.1.0",
- "string-width": "^5.0.1",
- "strip-ansi": "^7.0.1"
- }
- }
- }
- },
- "@isaacs/fs-minipass": {
- "version": "4.0.0",
- "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.0.tgz",
- "integrity": "sha512-S00nN1Qt3z3dSP6Db45fj/mksrAq5XWNIJ/SWXGP8XPT2jrzEuYRCSEx08JpJwBcG2F1xgiOtBMGDU0AZHmxew==",
- "requires": {
- "minipass": "^7.0.4"
- },
- "dependencies": {
- "minipass": {
- "version": "7.0.4",
- "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.0.4.tgz",
- "integrity": "sha512-jYofLM5Dam9279rdkWzqHozUo4ybjdZmCsDHePy5V/PbBcVMiSZR97gmAy45aqi8CK1lG2ECd356FU86avfwUQ=="
- }
- }
- },
- "@pkgjs/parseargs": {
- "version": "0.11.0",
- "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz",
- "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==",
- "optional": true
- },
"@protobufjs/aspromise": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
@@ -1747,15 +1278,22 @@
"integrity": "sha512-93+VvleD3mXwlLI/xASjw0FzKcwzl3OdTCzm1LaRfqgS21gfFtK3zDXM5Op9TeeMsJVOaJ2VRDpT9q4Y3d0AvA==",
"dev": true
},
+ "adm-zip": {
+ "version": "0.5.16",
+ "resolved": "https://registry.npmjs.org/adm-zip/-/adm-zip-0.5.16.tgz",
+ "integrity": "sha512-TGw5yVi4saajsSEgz25grObGHEUaDrniwvA2qwSC060KfqGPdglhvPMA2lPIoxs3PQIItj2iag35fONcQqgUaQ=="
+ },
"ansi-regex": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
- "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="
+ "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+ "dev": true
},
"ansi-styles": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+ "dev": true,
"requires": {
"color-convert": "^2.0.1"
}
@@ -1793,29 +1331,11 @@
"proxy-from-env": "^1.1.0"
}
},
- "balanced-match": {
- "version": "1.0.2",
- "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
- "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw=="
- },
"boolean": {
"version": "3.2.0",
"resolved": "https://registry.npmjs.org/boolean/-/boolean-3.2.0.tgz",
"integrity": "sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw=="
},
- "brace-expansion": {
- "version": "2.0.1",
- "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
- "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==",
- "requires": {
- "balanced-match": "^1.0.0"
- }
- },
- "chownr": {
- "version": "3.0.0",
- "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz",
- "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g=="
- },
"cliui": {
"version": "8.0.1",
"resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
@@ -1901,6 +1421,7 @@
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+ "dev": true,
"requires": {
"color-name": "~1.1.4"
}
@@ -1908,7 +1429,8 @@
"color-name": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
- "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA=="
+ "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+ "dev": true
},
"color-support": {
"version": "1.1.3",
@@ -1931,16 +1453,6 @@
"integrity": "sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==",
"dev": true
},
- "cross-spawn": {
- "version": "7.0.6",
- "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
- "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
- "requires": {
- "path-key": "^3.1.0",
- "shebang-command": "^2.0.0",
- "which": "^2.0.1"
- }
- },
"debug": {
"version": "4.3.4",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz",
@@ -1993,15 +1505,11 @@
"resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz",
"integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g=="
},
- "eastasianwidth": {
- "version": "0.2.0",
- "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz",
- "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA=="
- },
"emoji-regex": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
- "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A=="
+ "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
+ "dev": true
},
"error-ex": {
"version": "1.3.2",
@@ -2050,22 +1558,6 @@
"integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==",
"dev": true
},
- "foreground-child": {
- "version": "3.1.1",
- "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.1.1.tgz",
- "integrity": "sha512-TMKDUnIte6bfb5nWv7V/caI169OHgvwjb7V4WkeUvbQQdjr5rWKqHFiKWb/fcOwB+CzBT+qbWjvj+DVwRskpIg==",
- "requires": {
- "cross-spawn": "^7.0.0",
- "signal-exit": "^4.0.1"
- },
- "dependencies": {
- "signal-exit": {
- "version": "4.1.0",
- "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz",
- "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw=="
- }
- }
- },
"form-data": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
@@ -2130,25 +1622,6 @@
"integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
"dev": true
},
- "glob": {
- "version": "10.3.12",
- "resolved": "https://registry.npmjs.org/glob/-/glob-10.3.12.tgz",
- "integrity": "sha512-TCNv8vJ+xz4QiqTpfOJA7HvYv+tNIRHKfUWw/q+v2jdgN4ebz+KY9tGx5J4rHP0o84mNP+ApH66HRX8us3Khqg==",
- "requires": {
- "foreground-child": "^3.1.0",
- "jackspeak": "^2.3.6",
- "minimatch": "^9.0.1",
- "minipass": "^7.0.4",
- "path-scurry": "^1.10.2"
- },
- "dependencies": {
- "minipass": {
- "version": "7.0.4",
- "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.0.4.tgz",
- "integrity": "sha512-jYofLM5Dam9279rdkWzqHozUo4ybjdZmCsDHePy5V/PbBcVMiSZR97gmAy45aqi8CK1lG2ECd356FU86avfwUQ=="
- }
- }
- },
"global-agent": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/global-agent/-/global-agent-3.0.0.tgz",
@@ -2217,21 +1690,14 @@
"is-fullwidth-code-point": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
- "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg=="
+ "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
+ "dev": true
},
"isexe": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
- "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw=="
- },
- "jackspeak": {
- "version": "2.3.6",
- "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-2.3.6.tgz",
- "integrity": "sha512-N3yCS/NegsOBokc8GAdM8UcmfsKiSS8cipheD/nivzr700H+nsMOxJjQnvwOcRYVuFkdH0wGUvW2WbXGmrZGbQ==",
- "requires": {
- "@isaacs/cliui": "^8.0.2",
- "@pkgjs/parseargs": "^0.11.0"
- }
+ "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==",
+ "dev": true
},
"json-parse-better-errors": {
"version": "1.0.2",
@@ -2320,14 +1786,6 @@
"mime-db": "1.52.0"
}
},
- "minimatch": {
- "version": "9.0.4",
- "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.4.tgz",
- "integrity": "sha512-KqWh+VchfxcMNRAJjj2tnsSJdNbHsVgnkBhTNrW7AjVo6OvLtxw8zfT9oLw1JSohlFzJ8jCoTgaoXvJ+kHt6fw==",
- "requires": {
- "brace-expansion": "^2.0.1"
- }
- },
"minimist": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
@@ -2337,23 +1795,8 @@
"minipass": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/minipass/-/minipass-5.0.0.tgz",
- "integrity": "sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ=="
- },
- "minizlib": {
- "version": "3.0.1",
- "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-3.0.1.tgz",
- "integrity": "sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==",
- "requires": {
- "minipass": "^7.0.4",
- "rimraf": "^5.0.5"
- },
- "dependencies": {
- "minipass": {
- "version": "7.0.4",
- "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.0.4.tgz",
- "integrity": "sha512-jYofLM5Dam9279rdkWzqHozUo4ybjdZmCsDHePy5V/PbBcVMiSZR97gmAy45aqi8CK1lG2ECd356FU86avfwUQ=="
- }
- }
+ "integrity": "sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==",
+ "dev": true
},
"mkdirp": {
"version": "0.5.6",
@@ -2415,27 +1858,6 @@
"json-parse-better-errors": "^1.0.1"
}
},
- "path-key": {
- "version": "3.1.1",
- "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz",
- "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q=="
- },
- "path-scurry": {
- "version": "1.10.2",
- "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.10.2.tgz",
- "integrity": "sha512-7xTavNy5RQXnsjANvVvMkEjvloOinkAjv/Z6Ildz9v2RinZ4SBKTWFOVRbaF8p0vpHnyjV/UwNDdKuUv6M5qcA==",
- "requires": {
- "lru-cache": "^10.2.0",
- "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0"
- },
- "dependencies": {
- "lru-cache": {
- "version": "10.2.0",
- "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.2.0.tgz",
- "integrity": "sha512-2bIM8x+VAf6JT4bKAljS1qUWgMsqZRPGJS6FSahIMPVvctcNhyVp7AJu7quxOW9jwkryBReKZY5tY5JYv2n/7Q=="
- }
- }
- },
"protobufjs": {
"version": "7.2.5",
"resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.5.tgz",
@@ -2499,14 +1921,6 @@
"integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
"dev": true
},
- "rimraf": {
- "version": "5.0.5",
- "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-5.0.5.tgz",
- "integrity": "sha512-CqDakW+hMe/Bz202FPEymy68P+G50RfMQK+Qo5YUqc9SPipvbGjCGKd0RSKEelbsfQuw3g5NZDSrlZZAJurH1A==",
- "requires": {
- "glob": "^10.3.7"
- }
- },
"roarr": {
"version": "2.15.4",
"resolved": "https://registry.npmjs.org/roarr/-/roarr-2.15.4.tgz",
@@ -2553,19 +1967,6 @@
"integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==",
"dev": true
},
- "shebang-command": {
- "version": "2.0.0",
- "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz",
- "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==",
- "requires": {
- "shebang-regex": "^3.0.0"
- }
- },
- "shebang-regex": {
- "version": "3.0.0",
- "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz",
- "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A=="
- },
"signal-exit": {
"version": "3.0.7",
"resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz",
@@ -2590,16 +1991,7 @@
"version": "4.2.3",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
"integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
- "requires": {
- "emoji-regex": "^8.0.0",
- "is-fullwidth-code-point": "^3.0.0",
- "strip-ansi": "^6.0.1"
- }
- },
- "string-width-cjs": {
- "version": "npm:string-width@4.2.3",
- "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
- "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+ "dev": true,
"requires": {
"emoji-regex": "^8.0.0",
"is-fullwidth-code-point": "^3.0.0",
@@ -2610,14 +2002,7 @@
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
"integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
- "requires": {
- "ansi-regex": "^5.0.1"
- }
- },
- "strip-ansi-cjs": {
- "version": "npm:strip-ansi@6.0.1",
- "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
- "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+ "dev": true,
"requires": {
"ansi-regex": "^5.0.1"
}
@@ -2634,31 +2019,6 @@
"integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==",
"dev": true
},
- "tar": {
- "version": "7.0.1",
- "resolved": "https://registry.npmjs.org/tar/-/tar-7.0.1.tgz",
- "integrity": "sha512-IjMhdQMZFpKsHEQT3woZVxBtCQY+0wk3CVxdRkGXEgyGa0dNS/ehPvOMr2nmfC7x5Zj2N+l6yZUpmICjLGS35w==",
- "requires": {
- "@isaacs/fs-minipass": "^4.0.0",
- "chownr": "^3.0.0",
- "minipass": "^5.0.0",
- "minizlib": "^3.0.1",
- "mkdirp": "^3.0.1",
- "yallist": "^5.0.0"
- },
- "dependencies": {
- "mkdirp": {
- "version": "3.0.1",
- "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-3.0.1.tgz",
- "integrity": "sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg=="
- },
- "yallist": {
- "version": "5.0.0",
- "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz",
- "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw=="
- }
- }
- },
"type-fest": {
"version": "0.13.1",
"resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.13.1.tgz",
@@ -2686,6 +2046,7 @@
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz",
"integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==",
+ "dev": true,
"requires": {
"isexe": "^2.0.0"
}
@@ -2710,16 +2071,6 @@
"strip-ansi": "^6.0.0"
}
},
- "wrap-ansi-cjs": {
- "version": "npm:wrap-ansi@7.0.0",
- "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
- "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
- "requires": {
- "ansi-styles": "^4.0.0",
- "string-width": "^4.1.0",
- "strip-ansi": "^6.0.0"
- }
- },
"y18n": {
"version": "5.0.8",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
diff --git a/js/node/package.json b/js/node/package.json
index 195e252f1064b..22af4b7876d37 100644
--- a/js/node/package.json
+++ b/js/node/package.json
@@ -7,17 +7,15 @@
},
"author": "fs-eire",
"binary": {
- "module_path": "./bin",
- "host": "https://onnxruntimetestdata.blob.core.windows.net/onnxruntime-node-prebuild/",
"napi_versions": [
- 3
+ 6
]
},
"version": "1.22.0",
"dependencies": {
+ "adm-zip": "^0.5.16",
"global-agent": "^3.0.0",
- "onnxruntime-common": "file:../common",
- "tar": "^7.0.1"
+ "onnxruntime-common": "file:../common"
},
"scripts": {
"postinstall": "node ./script/install",
diff --git a/js/node/script/install-metadata-versions.js b/js/node/script/install-metadata-versions.js
new file mode 100644
index 0000000000000..1261a36994300
--- /dev/null
+++ b/js/node/script/install-metadata-versions.js
@@ -0,0 +1,7 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// This file is generated by /js/scripts/update-version.ts
+// Do not modify file content manually.
+
+module.exports = { nuget: [{ feed: 'nuget', version: '1.22.0' }] };
diff --git a/js/node/script/install-metadata.js b/js/node/script/install-metadata.js
new file mode 100644
index 0000000000000..e0186ec45d1b4
--- /dev/null
+++ b/js/node/script/install-metadata.js
@@ -0,0 +1,58 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+'use strict';
+
+const metadataVersions = require('./install-metadata-versions.js');
+
+const metadata = {
+ // Requirements defines a list of manifests to install for a specific platform/architecture combination.
+ requirements: {
+ 'win32/x64': [],
+ 'win32/arm64': [],
+ 'linux/x64': ['cuda12'],
+ 'linux/arm64': [],
+ 'darwin/x64': [],
+ 'darwin/arm64': [],
+ },
+ // Each manifest defines a list of files to install
+ manifests: {
+ 'linux/x64:cuda12': {
+ './libonnxruntime_providers_cuda.so': {
+ package: 'nuget:linux/x64:cuda12',
+ path: 'runtimes/linux-x64/native/libonnxruntime_providers_cuda.so',
+ },
+ './libonnxruntime_providers_shared.so': {
+ package: 'nuget:linux/x64:cuda12',
+ path: 'runtimes/linux-x64/native/libonnxruntime_providers_shared.so',
+ },
+ './libonnxruntime_providers_tensorrt.so': {
+ package: 'nuget:linux/x64:cuda12',
+ path: 'runtimes/linux-x64/native/libonnxruntime_providers_tensorrt.so',
+ },
+ },
+ },
+ // Each package entry defines a list of candidate versions. The first available candidate will be used.
+ packages: {
+ 'nuget:win32/x64:cuda12': {
+ name: 'Microsoft.ML.OnnxRuntime.Gpu.Windows',
+ versions: metadataVersions.nuget,
+ },
+ 'nuget:linux/x64:cuda12': {
+ name: 'Microsoft.ML.OnnxRuntime.Gpu.Linux',
+ versions: metadataVersions.nuget,
+ },
+ },
+ feeds: {
+ nuget: {
+ type: 'nuget',
+ index: 'https://api.nuget.org/v3/index.json',
+ },
+ nuget_nightly: {
+ type: 'nuget',
+ index: 'https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json',
+ },
+ },
+};
+
+module.exports = metadata;
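
The three layers above resolve as: platform key -> manifest names -> per-file package references -> feed candidates. A minimal sketch of that lookup (illustrative only, not part of the patch; it assumes the shapes defined in this file):

    // Illustrative walkthrough of the metadata resolution (assumes the shapes above).
    const metadata = require('./install-metadata.js');

    const platform = 'linux/x64'; // `${os.platform()}/${os.arch()}`
    for (const name of metadata.requirements[platform] ?? []) {
      const manifest = metadata.manifests[`${platform}:${name}`]; // e.g. 'linux/x64:cuda12'
      for (const [file, { package: pkg, path: pathInPackage }] of Object.entries(manifest)) {
        const { name: nugetName, versions } = metadata.packages[pkg];
        // Each entry of `versions` names a key in metadata.feeds; the first
        // (feed, version) pair that resolves wins (see installPackages() below).
        console.log(`${file} <- ${nugetName}@${versions[0].version} from ${pathInPackage}`);
      }
    }
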
diff --git a/js/node/script/install-utils.js b/js/node/script/install-utils.js
new file mode 100644
index 0000000000000..abfacce881600
--- /dev/null
+++ b/js/node/script/install-utils.js
@@ -0,0 +1,306 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+'use strict';
+
+const fs = require('fs');
+const https = require('https');
+const { execFileSync } = require('child_process');
+const path = require('path');
+const os = require('os');
+const AdmZip = require('adm-zip'); // Use adm-zip instead of spawn
+
+async function downloadFile(url, dest) {
+ return new Promise((resolve, reject) => {
+ const file = fs.createWriteStream(dest);
+ https
+ .get(url, (res) => {
+ if (res.statusCode !== 200) {
+ file.close();
+ fs.unlinkSync(dest);
+ reject(new Error(`Failed to download from ${url}. HTTP status code = ${res.statusCode}`));
+ return;
+ }
+
+ res.pipe(file);
+ file.on('finish', () => {
+ file.close();
+ resolve();
+ });
+ file.on('error', (err) => {
+ fs.unlinkSync(dest);
+ reject(err);
+ });
+ })
+ .on('error', (err) => {
+ fs.unlinkSync(dest);
+ reject(err);
+ });
+ });
+}
+
+async function downloadJson(url) {
+ return new Promise((resolve, reject) => {
+ https
+ .get(url, (res) => {
+ const { statusCode } = res;
+ const contentType = res.headers['content-type'];
+
+ if (!statusCode) {
+ reject(new Error('No response status code from server.'));
+ return;
+ }
+ if (statusCode >= 400 && statusCode < 500) {
+ resolve(null);
+ return;
+ } else if (statusCode !== 200) {
+ reject(new Error(`Failed to download build list. HTTP status code = ${statusCode}`));
+ return;
+ }
+ if (!contentType || !/^application\/json/.test(contentType)) {
+ reject(new Error(`unexpected content type: ${contentType}`));
+ return;
+ }
+ res.setEncoding('utf8');
+ let rawData = '';
+ res.on('data', (chunk) => {
+ rawData += chunk;
+ });
+ res.on('end', () => {
+ try {
+ resolve(JSON.parse(rawData));
+ } catch (e) {
+ reject(e);
+ }
+ });
+ res.on('error', (err) => {
+ reject(err);
+ });
+ })
+ .on('error', (err) => {
+ reject(err);
+ });
+ });
+}
+
+async function installPackages(packages, manifests, feeds) {
+ // Step.1: resolve packages
+ const resolvedPackages = new Map();
+ for (const packageCandidates of packages) {
+ // iterate all candidates from packagesInfo and try to find the first one that exists
+ for (const { feed, version } of packageCandidates.versions) {
+ const { type, index } = feeds[feed];
+ const pkg = await resolvePackage(type, index, packageCandidates.name, version);
+ if (pkg) {
+ resolvedPackages.set(packageCandidates, pkg);
+ break;
+ }
+ }
+ if (!resolvedPackages.has(packageCandidates)) {
+ throw new Error(`Failed to resolve package. No package exists for: ${JSON.stringify(packageCandidates)}`);
+ }
+ }
+
+ // Step.2: download packages
+ for (const [pkgInfo, pkg] of resolvedPackages) {
+ const manifestsForPackage = manifests.filter((x) => x.packagesInfo === pkgInfo);
+ await pkg.download(manifestsForPackage);
+ }
+}
+
+async function resolvePackage(type, index, packageName, version) {
+ // https://learn.microsoft.com/en-us/nuget/api/overview
+ const nugetPackageUrlResolver = async (index, packageName, version) => {
+ // STEP.1 - get NuGet package index
+ const nugetIndex = await downloadJson(index);
+ if (!nugetIndex) {
+ throw new Error(`Failed to download NuGet index from ${index}`);
+ }
+
+ // STEP.2 - get the base url of "PackageBaseAddress/3.0.0"
+ const packageBaseUrl = nugetIndex.resources.find((x) => x['@type'] === 'PackageBaseAddress/3.0.0')?.['@id'];
+ if (!packageBaseUrl) {
+ throw new Error(`Failed to find PackageBaseAddress in NuGet index`);
+ }
+
+ // STEP.3 - get the package version info
+ const packageInfo = await downloadJson(`${packageBaseUrl}${packageName.toLowerCase()}/index.json`);
+ if (!packageInfo || !packageInfo.versions.includes(version.toLowerCase())) {
+ throw new Error(`Failed to find specific package versions for ${packageName} in ${index}`);
+ }
+
+ // STEP.4 - generate the package URL
+ const packageUrl = `${packageBaseUrl}${packageName.toLowerCase()}/${version.toLowerCase()}/${packageName.toLowerCase()}.${version.toLowerCase()}.nupkg`;
+ const packageFileName = `${packageName.toLowerCase()}.${version.toLowerCase()}.nupkg`;
+
+ return {
+ download: async (manifests) => {
+ if (manifests.length === 0) {
+ return;
+ }
+
+ // Create a temporary directory
+ const tempDir = path.join(os.tmpdir(), `onnxruntime-node-pkgs_${Date.now()}`);
+ fs.mkdirSync(tempDir, { recursive: true });
+
+ try {
+ const packageFilePath = path.join(tempDir, packageFileName);
+
+ // Download the NuGet package
+ console.log(`Downloading ${packageUrl}`);
+ await downloadFile(packageUrl, packageFilePath);
+
+ // Load the NuGet package (which is a ZIP file)
+ let zip;
+ try {
+ zip = new AdmZip(packageFilePath);
+ } catch (err) {
+ throw new Error(`Failed to open NuGet package: ${err.message}`);
+ }
+
+ // Extract only the needed files from the package
+ const extractDir = path.join(tempDir, 'extracted');
+ fs.mkdirSync(extractDir, { recursive: true });
+
+ // Process each manifest and extract/copy files to their destinations
+ for (const manifest of manifests) {
+ const { filepath, pathInPackage } = manifest;
+
+ // Create directory for the target file
+ const targetDir = path.dirname(filepath);
+ fs.mkdirSync(targetDir, { recursive: true });
+
+ // Check if the file exists directly in the zip
+ const zipEntry = zip.getEntry(pathInPackage);
+ if (!zipEntry) {
+ throw new Error(`Failed to find ${pathInPackage} in NuGet package`);
+ }
+
+ console.log(`Extracting ${pathInPackage} to ${filepath}`);
+
+ // Extract just this entry to a temporary location
+ const extractedFilePath = path.join(extractDir, path.basename(pathInPackage));
+ zip.extractEntryTo(zipEntry, extractDir, false, true);
+
+ // Copy to the final destination
+ fs.copyFileSync(extractedFilePath, filepath);
+ }
+ } finally {
+ // Clean up the temporary directory - always runs even if an error occurs
+ try {
+ fs.rmSync(tempDir, { recursive: true });
+ } catch (e) {
+ console.warn(`Failed to clean up temporary directory: ${tempDir}`, e);
+ // Don't rethrow this error as it would mask the original error
+ }
+ }
+ },
+ };
+ };
+
+ switch (type) {
+ case 'nuget':
+ return await nugetPackageUrlResolver(index, packageName, version);
+ default:
+ throw new Error(`Unsupported package type: ${type}`);
+ }
+}
+
+function tryGetCudaVersion() {
+ // Should only return 11 or 12.
+
+ // try to get the CUDA version from the system ( `nvcc --version` )
+ let ver = 12;
+ try {
+ const nvccVersion = execFileSync('nvcc', ['--version'], { encoding: 'utf8' });
+ const match = nvccVersion.match(/release (\d+)/);
+ if (match) {
+ ver = parseInt(match[1]);
+ if (ver !== 11 && ver !== 12) {
+ throw new Error(`Unsupported CUDA version: ${ver}`);
+ }
+ }
+ } catch (e) {
+ if (e?.code === 'ENOENT') {
+ console.warn('`nvcc` not found. Assuming CUDA 12.');
+ } else {
+ console.warn('Failed to detect CUDA version from `nvcc --version`:', e.message);
+ }
+ }
+
+ // assume CUDA 12 if failed to detect
+ return ver;
+}
+
+function parseInstallFlag() {
+ let flag = process.env.ONNXRUNTIME_NODE_INSTALL || process.env.npm_config_onnxruntime_node_install;
+ if (!flag) {
+ for (let i = 0; i < process.argv.length; i++) {
+ if (process.argv[i].startsWith('--onnxruntime-node-install=')) {
+ flag = process.argv[i].split('=')[1];
+ break;
+ } else if (process.argv[i] === '--onnxruntime-node-install') {
+ flag = 'true';
+ }
+ }
+ }
+ switch (flag) {
+ case 'true':
+ case '1':
+ case 'ON':
+ return true;
+ case 'skip':
+ return false;
+ case undefined: {
+ flag = parseInstallCudaFlag();
+ if (flag === 'skip') {
+ return false;
+ }
+ if (flag === 11) {
+ throw new Error('CUDA 11 is no longer supported. Please consider using CPU or upgrade to CUDA 12.');
+ }
+ if (flag === 12) {
+ return 'cuda12';
+ }
+ return undefined;
+ }
+ default:
+ if (!flag || typeof flag !== 'string') {
+ throw new Error(`Invalid value for --onnxruntime-node-install: ${flag}`);
+ }
+ return flag;
+ }
+}
+
+function parseInstallCudaFlag() {
+ let flag = process.env.ONNXRUNTIME_NODE_INSTALL_CUDA || process.env.npm_config_onnxruntime_node_install_cuda;
+ if (!flag) {
+ for (let i = 0; i < process.argv.length; i++) {
+ if (process.argv[i].startsWith('--onnxruntime-node-install-cuda=')) {
+ flag = process.argv[i].split('=')[1];
+ break;
+ } else if (process.argv[i] === '--onnxruntime-node-install-cuda') {
+ flag = 'true';
+ }
+ }
+ }
+ switch (flag) {
+ case 'true':
+ case '1':
+ case 'ON':
+ return tryGetCudaVersion();
+ case 'v11':
+ return 11;
+ case 'v12':
+ return 12;
+ case 'skip':
+ case undefined:
+ return flag;
+ default:
+ throw new Error(`Invalid value for --onnxruntime-node-install-cuda: ${flag}`);
+ }
+}
+
+module.exports = {
+ installPackages,
+ parseInstallFlag,
+};
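
The exports above can also be driven by hand; a hedged sketch of the resolve-and-download flow (the platform key, manifest name, and output folder are illustrative, mirroring what install.js below assembles):

    // Illustrative manual use of installPackages() (assumes install-metadata.js as defined earlier).
    const path = require('path');
    const { installPackages } = require('./install-utils.js');
    const metadata = require('./install-metadata.js');

    const platform = 'linux/x64';
    const binFolder = path.join(__dirname, '..', 'bin/napi-v6', platform);

    const packages = new Set();
    const manifests = [];
    for (const [filename, { package: pkg, path: pathInPackage }] of Object.entries(
      metadata.manifests[`${platform}:cuda12`],
    )) {
      const candidates = metadata.packages[pkg];
      packages.add(candidates); // Set de-duplicates shared candidate lists
      manifests.push({ filepath: path.join(binFolder, filename), packagesInfo: candidates, pathInPackage });
    }

    installPackages(packages, manifests, metadata.feeds).catch((err) => {
      console.error(err);
      process.exitCode = 1;
    });
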
diff --git a/js/node/script/install.js b/js/node/script/install.js
index d406da3591eec..b278b4ade6e3c 100644
--- a/js/node/script/install.js
+++ b/js/node/script/install.js
@@ -8,21 +8,20 @@
// not always available.
// The purpose of this script is to download the required binaries for the platform and architecture.
-// Currently, most of the binaries are already bundled in the package, except for the following:
-// - Linux/x64/CUDA 12
+// Currently, most of the binaries are already bundled in the package, except for the files described in
+// install-metadata.js.
//
-// The CUDA binaries are not bundled because they are too large to be allowed in the npm registry. Instead, they are
-// downloaded from the GitHub release page of ONNX Runtime. The script will download the binaries if they are not
-// already present in the package.
+// Some files (e.g. the CUDA EP binaries) are not bundled because they are too large to be allowed in the npm registry.
+// Instead, they are downloaded from the NuGet feed. The script will download the binaries if they are not already
+// present in the npm package.
// Step.1: Check if we should exit early
const os = require('os');
-const fs = require('fs');
-const https = require('https');
const path = require('path');
-const tar = require('tar');
-const { execFileSync } = require('child_process');
const { bootstrap: globalAgentBootstrap } = require('global-agent');
+const { installPackages, parseInstallFlag } = require('./install-utils.js');
+
+const INSTALL_METADATA = require('./install-metadata.js');
// Bootstrap global-agent to honor the proxy settings in
// environment variables, e.g. GLOBAL_AGENT_HTTPS_PROXY.
@@ -30,169 +29,106 @@ const { bootstrap: globalAgentBootstrap } = require('global-agent');
globalAgentBootstrap();
// commandline flag:
-// --onnxruntime-node-install-cuda Force install the CUDA EP binaries. Try to detect the CUDA version.
-// --onnxruntime-node-install-cuda=v11 Force install the CUDA EP binaries for CUDA 11.
-// --onnxruntime-node-install-cuda=v12 Force install the CUDA EP binaries for CUDA 12.
+//
+// --onnxruntime-node-install Force install the files that are not bundled in the package.
+//
+// --onnxruntime-node-install=skip Skip the installation of the files that are not bundled in the package.
+//
+// --onnxruntime-node-install=cuda12 Force install the CUDA EP binaries for CUDA 12.
+//
+// --onnxruntime-node-install-cuda Force install the CUDA EP binaries.
+// (deprecated, use --onnxruntime-node-install=cuda12)
+//
// --onnxruntime-node-install-cuda=skip Skip the installation of the CUDA EP binaries.
+// (deprecated, use --onnxruntime-node-install=skip)
//
-// Alternatively, use environment variable "ONNXRUNTIME_NODE_INSTALL_CUDA"
//
-// If the flag is not provided, the script will only install the CUDA EP binaries when:
-// - The platform is Linux/x64.
-// - The binaries are not already present in the package.
-// - The installation is not a local install (when used inside ONNX Runtime repo).
+// Alternatively, use environment variable "ONNXRUNTIME_NODE_INSTALL" or "ONNXRUNTIME_NODE_INSTALL_CUDA" (deprecated).
//
-const INSTALL_CUDA_FLAG = parseInstallCudaFlag();
-const NO_INSTALL = INSTALL_CUDA_FLAG === 'skip';
-const FORCE_INSTALL = !NO_INSTALL && INSTALL_CUDA_FLAG;
-
-const IS_LINUX_X64 = os.platform() === 'linux' && os.arch() === 'x64';
-const BIN_FOLDER = path.join(__dirname, '..', 'bin/napi-v3/linux/x64');
-const BIN_FOLDER_EXISTS = fs.existsSync(BIN_FOLDER);
-const CUDA_DLL_EXISTS = fs.existsSync(path.join(BIN_FOLDER, 'libonnxruntime_providers_cuda.so'));
-const ORT_VERSION = require('../package.json').version;
-
-const npm_config_local_prefix = process.env.npm_config_local_prefix;
-const npm_package_json = process.env.npm_package_json;
-const SKIP_LOCAL_INSTALL =
- npm_config_local_prefix && npm_package_json && path.dirname(npm_package_json) === npm_config_local_prefix;
-
-const shouldInstall = FORCE_INSTALL || (!SKIP_LOCAL_INSTALL && IS_LINUX_X64 && BIN_FOLDER_EXISTS && !CUDA_DLL_EXISTS);
-if (NO_INSTALL || !shouldInstall) {
+// If the flag is not provided, the script will look up the metadata file to determine the manifest.
+//
+
+/**
+ * Possible values:
+ * - undefined: the default behavior. This is the value when no installation flag is specified.
+ *
+ * - false: skip installation. This is the value when the installation flag is set to "skip":
+ * --onnxruntime-node-install=skip
+ *
+ * - true: force installation. This is the value when the installation flag is set with no value:
+ * --onnxruntime-node-install
+ *
+ * - string: the installation flag is set to a specific value:
+ * --onnxruntime-node-install=cuda12
+ */
+const INSTALL_FLAG = parseInstallFlag();
+
+// if installation is skipped, exit early
+if (INSTALL_FLAG === false) {
process.exit(0);
}
-
-// Step.2: Download the required binaries
-const artifactUrl = {
- get 11() {
- // TODO: support ORT Cuda v11 binaries
- throw new Error(`CUDA 11 binaries are not supported by this script yet.
-
-To use ONNX Runtime Node.js binding with CUDA v11 support, please follow the manual steps:
-
-1. Use "--onnxruntime-node-install-cuda=skip" to skip the auto installation.
-2. Navigate to https://aiinfra.visualstudio.com/PublicPackages/_artifacts/feed/onnxruntime-cuda-11
-3. Download the binaries for your platform and architecture
-4. Extract the following binaries to "node_modules/onnxruntime-node/bin/napi-v3/linux/x64:
- - libonnxruntime_providers_tensorrt.so
- - libonnxruntime_providers_shared.so
- - libonnxruntime.so.${ORT_VERSION}
- - libonnxruntime_providers_cuda.so
-`);
- },
- 12: `https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-gpu-${
- ORT_VERSION
- }.tgz`,
-}[INSTALL_CUDA_FLAG || tryGetCudaVersion()];
-console.log(`Downloading "${artifactUrl}"...`);
-
-const FILES = new Set([
- 'libonnxruntime_providers_tensorrt.so',
- 'libonnxruntime_providers_shared.so',
- `libonnxruntime.so.${ORT_VERSION}`,
- 'libonnxruntime_providers_cuda.so',
-]);
-
-downloadAndExtract(artifactUrl, BIN_FOLDER, FILES);
-
-async function downloadAndExtract(url, dest, files) {
- return new Promise((resolve, reject) => {
- https.get(url, (res) => {
- const { statusCode } = res;
- const contentType = res.headers['content-type'];
-
- if (statusCode === 301 || statusCode === 302) {
- downloadAndExtract(res.headers.location, dest, files).then(
- (value) => resolve(value),
- (reason) => reject(reason),
- );
- return;
- } else if (statusCode !== 200) {
- throw new Error(`Failed to download the binaries: ${res.statusCode} ${res.statusMessage}.
-
-Use "--onnxruntime-node-install-cuda=skip" to skip the installation. You will still be able to use ONNX Runtime, but the CUDA EP will not be available.`);
- }
-
- if (!contentType || !/^application\/octet-stream/.test(contentType)) {
- throw new Error(`unexpected content type: ${contentType}`);
- }
-
- res
- .pipe(
- tar.t({
- strict: true,
- onentry: (entry) => {
- const filename = path.basename(entry.path);
- if (entry.type === 'File' && files.has(filename)) {
- console.log(`Extracting "${filename}" to "${dest}"...`);
- entry.pipe(fs.createWriteStream(path.join(dest, filename)));
- entry.on('finish', () => {
- console.log(`Finished extracting "${filename}".`);
- });
- }
- },
- }),
- )
- .on('error', (err) => {
- throw new Error(`Failed to extract the binaries: ${err.message}.
-
-Use "--onnxruntime-node-install-cuda=skip" to skip the installation. You will still be able to use ONNX Runtime, but the CUDA EP will not be available.`);
- });
- });
- });
+// if installation is not specified, exit early when the installation is local (e.g. `npm ci` in /js/node/)
+if (INSTALL_FLAG === undefined) {
+ const npm_config_local_prefix = process.env.npm_config_local_prefix;
+ const npm_package_json = process.env.npm_package_json;
+ const IS_LOCAL_INSTALL =
+ npm_config_local_prefix && npm_package_json && path.dirname(npm_package_json) === npm_config_local_prefix;
+ if (IS_LOCAL_INSTALL) {
+ process.exit(0);
+ }
}
-function tryGetCudaVersion() {
- // Should only return 11 or 12.
-
- // try to get the CUDA version from the system ( `nvcc --version` )
- let ver = 12;
- try {
- const nvccVersion = execFileSync('nvcc', ['--version'], { encoding: 'utf8' });
- const match = nvccVersion.match(/release (\d+)/);
- if (match) {
- ver = parseInt(match[1]);
- if (ver !== 11 && ver !== 12) {
- throw new Error(`Unsupported CUDA version: ${ver}`);
- }
- }
- } catch (e) {
- if (e?.code === 'ENOENT') {
- console.warn('`nvcc` not found. Assuming CUDA 12.');
- } else {
- console.warn('Failed to detect CUDA version from `nvcc --version`:', e.message);
+const PLATFORM = `${os.platform()}/${os.arch()}`;
+let INSTALL_MANIFEST_NAMES = INSTALL_METADATA.requirements[PLATFORM] ?? [];
+
+// if installation is specified explicitly, validate the manifest
+if (typeof INSTALL_FLAG === 'string') {
+ const installations = INSTALL_FLAG.split(',').map((x) => x.trim());
+ for (const installation of installations) {
+ if (INSTALL_MANIFEST_NAMES.indexOf(installation) === -1) {
+ throw new Error(`Invalid installation: ${installation} for platform: ${PLATFORM}`);
}
}
+ INSTALL_MANIFEST_NAMES = installations;
+}
+
+const BIN_FOLDER = path.join(__dirname, '..', 'bin/napi-v6', PLATFORM);
+const INSTALL_MANIFESTS = [];
+
+const PACKAGES = new Set();
+for (const name of INSTALL_MANIFEST_NAMES) {
+ const manifest = INSTALL_METADATA.manifests[`${PLATFORM}:${name}`];
+ if (!manifest) {
+ throw new Error(`Manifest not found: ${name} for platform: ${PLATFORM}`);
+ }
- // assume CUDA 12 if failed to detect
- return ver;
+ for (const [filename, { package: pkg, path: pathInPackage }] of Object.entries(manifest)) {
+ const packageCandidates = INSTALL_METADATA.packages[pkg];
+ if (!packageCandidates) {
+ throw new Error(`Package information not found: ${pkg}`);
+ }
+ PACKAGES.add(packageCandidates);
+
+ INSTALL_MANIFESTS.push({
+ filepath: path.normalize(path.join(BIN_FOLDER, filename)),
+ packagesInfo: packageCandidates,
+ pathInPackage,
+ });
+ }
}
-function parseInstallCudaFlag() {
- let flag = process.env.ONNXRUNTIME_NODE_INSTALL_CUDA || process.env.npm_config_onnxruntime_node_install_cuda;
- if (!flag) {
- for (let i = 0; i < process.argv.length; i++) {
- if (process.argv[i].startsWith('--onnxruntime-node-install-cuda=')) {
- flag = process.argv[i].split('=')[1];
- break;
- } else if (process.argv[i] === '--onnxruntime-node-install-cuda') {
- flag = 'true';
- }
+// If the installation flag is not specified, check whether the required files are already installed.
+if (INSTALL_FLAG === undefined) {
+ let hasMissingFiles = false;
+ for (const { filepath } of INSTALL_MANIFESTS) {
+ if (!require('fs').existsSync(filepath)) {
+ hasMissingFiles = true;
+ break;
}
}
- switch (flag) {
- case 'true':
- case '1':
- case 'ON':
- return tryGetCudaVersion();
- case 'v11':
- return 11;
- case 'v12':
- return 12;
- case 'skip':
- case undefined:
- return flag;
- default:
- throw new Error(`Invalid value for --onnxruntime-node-install-cuda: ${flag}`);
+ if (!hasMissingFiles) {
+ process.exit(0);
}
}
+
+void installPackages(PACKAGES, INSTALL_MANIFESTS, INSTALL_METADATA.feeds);
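
End to end, the postinstall behavior above can be exercised like this (a hedged summary; commands shown as comments, with onnxruntime-node as the published package name):

    // Illustrative invocations of the install script above:
    //
    //   npm install onnxruntime-node
    //     default: consult install-metadata.js and download only missing files
    //
    //   npm install onnxruntime-node --onnxruntime-node-install=skip
    //     skip all extra downloads (the CUDA EP will be unavailable)
    //
    //   npm install onnxruntime-node --onnxruntime-node-install=cuda12
    //     force-install the CUDA 12 EP binaries
    //
    //   ONNXRUNTIME_NODE_INSTALL=cuda12 npm install onnxruntime-node
    //     same as above, via environment variable
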
diff --git a/js/node/src/inference_session_wrap.cc b/js/node/src/inference_session_wrap.cc
index 5512b418b5cfb..84ed3457a488b 100644
--- a/js/node/src/inference_session_wrap.cc
+++ b/js/node/src/inference_session_wrap.cc
@@ -5,18 +5,12 @@
#include "common.h"
#include "inference_session_wrap.h"
+#include "ort_instance_data.h"
#include "run_options_helper.h"
#include "session_options_helper.h"
#include "tensor_helper.h"
#include
-Napi::FunctionReference InferenceSessionWrap::wrappedSessionConstructor;
-Napi::FunctionReference InferenceSessionWrap::ortTensorConstructor;
-
-Napi::FunctionReference& InferenceSessionWrap::GetTensorConstructor() {
- return InferenceSessionWrap::ortTensorConstructor;
-}
-
Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
// create ONNX runtime env
Ort::InitApi();
@@ -37,8 +31,8 @@ Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
InstanceAccessor("inputMetadata", &InferenceSessionWrap::GetMetadata, nullptr, napi_default, reinterpret_cast(true)),
InstanceAccessor("outputMetadata", &InferenceSessionWrap::GetMetadata, nullptr, napi_default, reinterpret_cast(false))});
- wrappedSessionConstructor = Napi::Persistent(func);
- wrappedSessionConstructor.SuppressDestruct();
+ OrtInstanceData::Create(env, func);
+
exports.Set("InferenceSession", func);
Napi::Function listSupportedBackends = Napi::Function::New(env, InferenceSessionWrap::ListSupportedBackends);
@@ -55,22 +49,15 @@ Napi::Value InferenceSessionWrap::InitOrtOnce(const Napi::CallbackInfo& info) {
Napi::HandleScope scope(env);
int log_level = info[0].As<Napi::Number>().Int32Value();
-
- Ort::Env* ortEnv = env.GetInstanceData<Ort::Env>();
- if (ortEnv == nullptr) {
- ortEnv = new Ort::Env{OrtLoggingLevel(log_level), "onnxruntime-node"};
- env.SetInstanceData(ortEnv);
- }
-
Napi::Function tensorConstructor = info[1].As<Napi::Function>();
- ortTensorConstructor = Napi::Persistent(tensorConstructor);
- ortTensorConstructor.SuppressDestruct();
+
+ OrtInstanceData::InitOrt(env, log_level, tensorConstructor);
return env.Undefined();
}
InferenceSessionWrap::InferenceSessionWrap(const Napi::CallbackInfo& info)
- : Napi::ObjectWrap<InferenceSessionWrap>(info), initialized_(false), disposed_(false), session_(nullptr), defaultRunOptions_(nullptr) {}
+ : Napi::ObjectWrap<InferenceSessionWrap>(info), initialized_(false), disposed_(false), session_(nullptr) {}
Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
@@ -83,14 +70,13 @@ Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo& info) {
ORT_NAPI_THROW_TYPEERROR_IF(argsLength == 0, env, "Expect argument: model file path or buffer.");
try {
- defaultRunOptions_.reset(new Ort::RunOptions{});
Ort::SessionOptions sessionOptions;
if (argsLength == 2 && info[0].IsString() && info[1].IsObject()) {
Napi::String value = info[0].As<Napi::String>();
ParseSessionOptions(info[1].As(), sessionOptions);
- this->session_.reset(new Ort::Session(*env.GetInstanceData<Ort::Env>(),
+ this->session_.reset(new Ort::Session(*OrtInstanceData::OrtEnv(),
#ifdef _WIN32
reinterpret_cast<const wchar_t*>(value.Utf16Value().c_str()),
#else
@@ -105,7 +91,7 @@ Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo& info) {
int64_t bytesLength = info[2].As<Napi::Number>().Int64Value();
ParseSessionOptions(info[3].As(), sessionOptions);
- this->session_.reset(new Ort::Session(*env.GetInstanceData(),
+ this->session_.reset(new Ort::Session(*OrtInstanceData::OrtEnv(),
reinterpret_cast<char*>(buffer) + bytesOffset, bytesLength,
sessionOptions));
} else {
@@ -225,7 +211,7 @@ Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo& info) {
ParseRunOptions(info[2].As<Napi::Object>(), runOptions);
}
if (preferredOutputLocations_.size() == 0) {
- session_->Run(runOptions == nullptr ? *defaultRunOptions_.get() : runOptions,
+ session_->Run(runOptions == nullptr ? *OrtInstanceData::OrtDefaultRunOptions() : runOptions,
inputIndex == 0 ? nullptr : &inputNames_cstr[0], inputIndex == 0 ? nullptr : &inputValues[0],
inputIndex, outputIndex == 0 ? nullptr : &outputNames_cstr[0],
outputIndex == 0 ? nullptr : &outputValues[0], outputIndex);
@@ -254,7 +240,7 @@ Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo& info) {
}
}
- session_->Run(runOptions == nullptr ? *defaultRunOptions_.get() : runOptions, *ioBinding_);
+ session_->Run(runOptions == nullptr ? *OrtInstanceData::OrtDefaultRunOptions() : runOptions, *ioBinding_);
auto outputs = ioBinding_->GetOutputValues();
ORT_NAPI_THROW_ERROR_IF(outputs.size() != outputIndex, env, "Output count mismatch.");
@@ -278,8 +264,6 @@ Napi::Value InferenceSessionWrap::Dispose(const Napi::CallbackInfo& info) {
ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");
this->ioBinding_.reset(nullptr);
-
- this->defaultRunOptions_.reset(nullptr);
this->session_.reset(nullptr);
this->disposed_ = true;
diff --git a/js/node/src/inference_session_wrap.h b/js/node/src/inference_session_wrap.h
index 776cdc0d3b51e..7a6b1232400ec 100644
--- a/js/node/src/inference_session_wrap.h
+++ b/js/node/src/inference_session_wrap.h
@@ -12,7 +12,6 @@
class InferenceSessionWrap : public Napi::ObjectWrap<InferenceSessionWrap> {
public:
static Napi::Object Init(Napi::Env env, Napi::Object exports);
- static Napi::FunctionReference& GetTensorConstructor();
InferenceSessionWrap(const Napi::CallbackInfo& info);
@@ -79,15 +78,10 @@ class InferenceSessionWrap : public Napi::ObjectWrap {
// private members
- // persistent constructor
- static Napi::FunctionReference wrappedSessionConstructor;
- static Napi::FunctionReference ortTensorConstructor;
-
// session objects
bool initialized_;
bool disposed_;
std::unique_ptr<Ort::Session> session_;
- std::unique_ptr<Ort::RunOptions> defaultRunOptions_;
// input/output metadata
std::vector<std::string> inputNames_;
diff --git a/js/node/src/ort_instance_data.cc b/js/node/src/ort_instance_data.cc
new file mode 100644
index 0000000000000..d9b66909f1291
--- /dev/null
+++ b/js/node/src/ort_instance_data.cc
@@ -0,0 +1,60 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <atomic>
+#include <mutex>
+
+#include "common.h"
+#include "ort_instance_data.h"
+#include "onnxruntime_cxx_api.h"
+
+std::unique_ptr<Ort::Env> OrtInstanceData::ortEnv;
+std::unique_ptr<Ort::RunOptions> OrtInstanceData::ortDefaultRunOptions;
+std::mutex OrtInstanceData::ortEnvMutex;
+std::atomic<uint64_t> OrtInstanceData::ortEnvRefCount;
+std::atomic<bool> OrtInstanceData::ortEnvDestroyed;
+
+OrtInstanceData::OrtInstanceData() {
+ ++ortEnvRefCount;
+}
+
+OrtInstanceData::~OrtInstanceData() {
+ if (--ortEnvRefCount == 0) {
+ std::lock_guard<std::mutex> lock(ortEnvMutex);
+ if (ortEnv) {
+ ortDefaultRunOptions.reset(nullptr);
+ ortEnv.reset();
+ ortEnvDestroyed = true;
+ }
+ }
+}
+
+void OrtInstanceData::Create(Napi::Env env, Napi::Function inferenceSessionWrapperFunction) {
+ ORT_NAPI_THROW_ERROR_IF(env.GetInstanceData<OrtInstanceData>() != nullptr, env, "OrtInstanceData already created.");
+ auto data = new OrtInstanceData{};
+ data->wrappedSessionConstructor = Napi::Persistent(inferenceSessionWrapperFunction);
+ env.SetInstanceData(data);
+}
+
+void OrtInstanceData::InitOrt(Napi::Env env, int log_level, Napi::Function tensorConstructor) {
+ auto data = env.GetInstanceData<OrtInstanceData>();
+ ORT_NAPI_THROW_ERROR_IF(data == nullptr, env, "OrtInstanceData not created.");
+
+ data->ortTensorConstructor = Napi::Persistent(tensorConstructor);
+
+ if (!ortEnv) {
+ std::lock_guard<std::mutex> lock(ortEnvMutex);
+ if (!ortEnv) {
+ ORT_NAPI_THROW_ERROR_IF(ortEnvDestroyed, env, "OrtEnv already destroyed.");
+ ortEnv.reset(new Ort::Env{OrtLoggingLevel(log_level), "onnxruntime-node"});
+ ortDefaultRunOptions.reset(new Ort::RunOptions{});
+ }
+ }
+}
+
+const Napi::FunctionReference& OrtInstanceData::TensorConstructor(Napi::Env env) {
+ auto data = env.GetInstanceData<OrtInstanceData>();
+ ORT_NAPI_THROW_ERROR_IF(data == nullptr, env, "OrtInstanceData not created.");
+
+ return data->ortTensorConstructor;
+}
diff --git a/js/node/src/ort_instance_data.h b/js/node/src/ort_instance_data.h
new file mode 100644
index 0000000000000..bb70ac8e87d3a
--- /dev/null
+++ b/js/node/src/ort_instance_data.h
@@ -0,0 +1,50 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <napi.h>
+#include "onnxruntime_cxx_api.h"
+
+/**
+ * The OrtInstanceData class is designed to manage the lifecycle of necessary instance data, including:
+ * - The Ort::Env singleton instance.
+ * This is a global singleton that is shared across all InferenceSessionWrap instances. It is created the first
+ * time `InferenceSession.initOrtOnce()` is called. It is destroyed when the last active NAPI Env is destroyed.
+ * Once destroyed, it cannot be created again.
+ *
+ * - The Object reference of the InferenceSessionWrap class and the Tensor constructor.
+ * This is per-env data that has the same lifecycle as the Napi::Env. If there are worker threads, each thread will
+ * have its own handle to the InferenceSessionWrap class and the Tensor constructor.
+ *
+ * The OrtInstanceData class is bound to the Napi::Env using environment life cycle APIs.
+ * see https://nodejs.org/api/n-api.html#environment-life-cycle-apis
+ */
+struct OrtInstanceData {
+ // Create a new OrtInstanceData object related to the Napi::Env
+ static void Create(Napi::Env env, Napi::Function inferenceSessionWrapperFunction);
+ // Initialize Ort for the Napi::Env
+ static void InitOrt(Napi::Env env, int log_level, Napi::Function tensorConstructor);
+ // Get the Tensor constructor reference for the Napi::Env
+ static const Napi::FunctionReference& TensorConstructor(Napi::Env env);
+ // Get the global Ort::Env
+ static const Ort::Env* OrtEnv() { return ortEnv.get(); }
+ // Get the default Ort::RunOptions
+ static Ort::RunOptions* OrtDefaultRunOptions() { return ortDefaultRunOptions.get(); }
+
+ ~OrtInstanceData();
+
+ private:
+ OrtInstanceData();
+
+ // per env persistent constructors
+ Napi::FunctionReference wrappedSessionConstructor;
+ Napi::FunctionReference ortTensorConstructor;
+
+ // ORT env (global singleton)
+ static std::unique_ptr<Ort::Env> ortEnv;
+ static std::unique_ptr<Ort::RunOptions> ortDefaultRunOptions;
+ static std::mutex ortEnvMutex;
+ static std::atomic<uint64_t> ortEnvRefCount;
+ static std::atomic<bool> ortEnvDestroyed;
+};
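Taken together, the two files above implement a ref-counted, create-once/destroy-once singleton guarded by double-checked locking. A minimal sketch of that lifecycle pattern, with hypothetical stand-in types (`Env`, `InstanceData`) in place of the N-API and ORT types:

```cpp
#include <atomic>
#include <memory>
#include <mutex>
#include <stdexcept>

// Stand-in for the Ort::Env payload owned by the singleton.
struct Env {};

struct InstanceData {
  InstanceData() { ++refCount; }

  ~InstanceData() {
    // The last owner tears the singleton down; it can never come back.
    if (--refCount == 0) {
      std::lock_guard<std::mutex> lock(envMutex);
      if (env) {
        env.reset();
        envDestroyed = true;
      }
    }
  }

  static void Init() {
    // Double-checked locking: cheap unsynchronized test first,
    // then re-test under the mutex before constructing.
    if (!env) {
      std::lock_guard<std::mutex> lock(envMutex);
      if (!env) {
        if (envDestroyed) throw std::runtime_error("Env already destroyed.");
        env = std::make_unique<Env>();
      }
    }
  }

  static std::unique_ptr<Env> env;
  static std::mutex envMutex;
  static std::atomic<uint64_t> refCount;
  static std::atomic<bool> envDestroyed;
};

std::unique_ptr<Env> InstanceData::env;
std::mutex InstanceData::envMutex;
std::atomic<uint64_t> InstanceData::refCount{0};
std::atomic<bool> InstanceData::envDestroyed{false};
```

As in the diff, the unsynchronized `if (!env)` fast path is benign only because N-API serializes instance-data creation and destruction per environment; a fully general singleton would want an atomic pointer or `std::call_once`.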
diff --git a/js/node/src/session_options_helper.cc b/js/node/src/session_options_helper.cc
index 3c607d88e5402..b189b45556306 100644
--- a/js/node/src/session_options_helper.cc
+++ b/js/node/src/session_options_helper.cc
@@ -321,44 +321,46 @@ void ParseSessionOptions(const Napi::Object options, Ort::SessionOptions& sessio
// external data
if (options.Has("externalData")) {
auto externalDataValue = options.Get("externalData");
- ORT_NAPI_THROW_TYPEERROR_IF(!externalDataValue.IsArray(), options.Env(),
- "Invalid argument: sessionOptions.externalData must be an array.");
- auto externalData = externalDataValue.As<Napi::Array>();
- std::vector<std::basic_string<ORTCHAR_T>> paths;
- std::vector<char*> buffs;
- std::vector<size_t> sizes;
+ if (!externalDataValue.IsNull() && !externalDataValue.IsUndefined()) {
+ ORT_NAPI_THROW_TYPEERROR_IF(!externalDataValue.IsArray(), options.Env(),
+ "Invalid argument: sessionOptions.externalData must be an array.");
+ auto externalData = externalDataValue.As<Napi::Array>();
+ std::vector<std::basic_string<ORTCHAR_T>> paths;
+ std::vector<char*> buffs;
+ std::vector<size_t> sizes;
- for (const auto& kvp : externalData) {
- Napi::Value value = kvp.second;
- ORT_NAPI_THROW_TYPEERROR_IF(!value.IsObject(), options.Env(),
- "Invalid argument: sessionOptions.externalData value must be an object in Node.js binding.");
- Napi::Object obj = value.As<Napi::Object>();
- ORT_NAPI_THROW_TYPEERROR_IF(!obj.Has("path") || !obj.Get("path").IsString(), options.Env(),
- "Invalid argument: sessionOptions.externalData value must have a 'path' property of type string in Node.js binding.");
+ for (const auto& kvp : externalData) {
+ Napi::Value value = kvp.second;
+ ORT_NAPI_THROW_TYPEERROR_IF(!value.IsObject(), options.Env(),
+ "Invalid argument: sessionOptions.externalData value must be an object in Node.js binding.");
+ Napi::Object obj = value.As<Napi::Object>();
+ ORT_NAPI_THROW_TYPEERROR_IF(!obj.Has("path") || !obj.Get("path").IsString(), options.Env(),
+ "Invalid argument: sessionOptions.externalData value must have a 'path' property of type string in Node.js binding.");
 #ifdef _WIN32
- auto path = obj.Get("path").As<Napi::String>().Utf16Value();
- paths.push_back(std::wstring{path.begin(), path.end()});
+ auto path = obj.Get("path").As<Napi::String>().Utf16Value();
+ paths.push_back(std::wstring{path.begin(), path.end()});
 #else
- auto path = obj.Get("path").As<Napi::String>().Utf8Value();
- paths.push_back(path);
+ auto path = obj.Get("path").As<Napi::String>().Utf8Value();
+ paths.push_back(path);
 #endif
- ORT_NAPI_THROW_TYPEERROR_IF(!obj.Has("data") ||
- !obj.Get("data").IsBuffer() ||
- !(obj.Get("data").IsTypedArray() && obj.Get("data").As<Napi::TypedArray>().TypedArrayType() == napi_uint8_array),
- options.Env(),
- "Invalid argument: sessionOptions.externalData value must have an 'data' property of type buffer or typed array in Node.js binding.");
+ ORT_NAPI_THROW_TYPEERROR_IF(!obj.Has("data") ||
+ !obj.Get("data").IsBuffer() ||
+ !(obj.Get("data").IsTypedArray() && obj.Get("data").As<Napi::TypedArray>().TypedArrayType() == napi_uint8_array),
+ options.Env(),
+ "Invalid argument: sessionOptions.externalData value must have an 'data' property of type buffer or typed array in Node.js binding.");
- auto data = obj.Get("data");
- if (data.IsBuffer()) {
- buffs.push_back(data.As<Napi::Buffer<char>>().Data());
- sizes.push_back(data.As<Napi::Buffer<char>>().Length());
- } else {
- auto typedArray = data.As<Napi::TypedArray>();
- buffs.push_back(reinterpret_cast<char*>(typedArray.ArrayBuffer().Data()) + typedArray.ByteOffset());
- sizes.push_back(typedArray.ByteLength());
+ auto data = obj.Get("data");
+ if (data.IsBuffer()) {
+ buffs.push_back(data.As<Napi::Buffer<char>>().Data());
+ sizes.push_back(data.As<Napi::Buffer<char>>().Length());
+ } else {
+ auto typedArray = data.As<Napi::TypedArray>();
+ buffs.push_back(reinterpret_cast<char*>(typedArray.ArrayBuffer().Data()) + typedArray.ByteOffset());
+ sizes.push_back(typedArray.ByteLength());
+ sizes.push_back(typedArray.ByteLength());
+ }
}
+ sessionOptions.AddExternalInitializersFromFilesInMemory(paths, buffs, sizes);
}
- sessionOptions.AddExternalInitializersFromFilesInMemory(paths, buffs, sizes);
}
}
diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc
index 12b1a79793ff3..0630386cfc645 100644
--- a/js/node/src/tensor_helper.cc
+++ b/js/node/src/tensor_helper.cc
@@ -7,6 +7,7 @@
#include
#include "common.h"
+#include "ort_instance_data.h"
#include "tensor_helper.h"
#include "inference_session_wrap.h"
@@ -275,12 +276,18 @@ Napi::Value OrtValueToNapiValue(Napi::Env env, Ort::Value&& value) {
}
// new Tensor("string", stringArray /* string[] */, dims /* number[] */)
- return scope.Escape(InferenceSessionWrap::GetTensorConstructor().New({Napi::String::New(env, "string"), stringArray, dims}));
+ return scope.Escape(OrtInstanceData::TensorConstructor(env)
+ .New({Napi::String::New(env, "string"),
+ stringArray,
+ dims}));
} else {
// number data
if (isGpuBuffer) {
// Tensor.fromGpuBuffer(buffer, options)
- Napi::Function tensorFromGpuBuffer = InferenceSessionWrap::GetTensorConstructor().Value().Get("fromGpuBuffer").As<Napi::Function>();
+ Napi::Function tensorFromGpuBuffer = OrtInstanceData::TensorConstructor(env)
+ .Value()
+ .Get("fromGpuBuffer")
+ .As<Napi::Function>();
OrtValue* underlyingOrtValue = value.release();
auto options = Napi::Object::New(env);
@@ -311,10 +318,10 @@ Napi::Value OrtValueToNapiValue(Napi::Env env, Ort::Value&& value) {
NAPI_THROW_IF_FAILED(env, status, Napi::Value);
// new Tensor(type, typedArrayData, dims)
- return scope.Escape(InferenceSessionWrap::GetTensorConstructor().New(
- {type,
- Napi::Value(env, typedArrayData),
- dims}));
+ return scope.Escape(OrtInstanceData::TensorConstructor(env)
+ .New({type,
+ Napi::Value(env, typedArrayData),
+ dims}));
}
}
}
diff --git a/js/node/test/e2e/worker-test.ts b/js/node/test/e2e/worker-test.ts
new file mode 100644
index 0000000000000..3b803c86d8b5b
--- /dev/null
+++ b/js/node/test/e2e/worker-test.ts
@@ -0,0 +1,51 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import { Worker, isMainThread, parentPort } from 'node:worker_threads';
+import { InferenceSession, Tensor } from 'onnxruntime-common';
+import { assertTensorEqual, SQUEEZENET_INPUT0_DATA, SQUEEZENET_OUTPUT0_DATA, TEST_DATA_ROOT } from '../test-utils';
+import * as path from 'path';
+
+if (isMainThread) {
+ describe('E2E Tests - worker test', () => {
+ it('should run in worker', (done) => {
+ const worker = new Worker(__filename, {
+ stdout: true,
+ stderr: true,
+ });
+ worker.on('message', (msg) => {
+ if (msg.result === 'success') {
+ done();
+ } else {
+ done(new Error(`Worker failed: ${msg.error}`));
+ }
+ });
+ worker.on('error', (err) => {
+ console.error(`Worker error: ${err}`);
+ done(err);
+ });
+ });
+ });
+} else {
+ const workerMain = async () => {
+ // require onnxruntime-node.
+ require('../..');
+
+ const input0 = new Tensor('float32', SQUEEZENET_INPUT0_DATA, [1, 3, 224, 224]);
+ const expectedOutput0 = new Tensor('float32', SQUEEZENET_OUTPUT0_DATA, [1, 1000, 1, 1]);
+
+ const session = await InferenceSession.create(path.join(TEST_DATA_ROOT, 'squeezenet.onnx'));
+
+ const result = await session!.run({ data_0: input0 }, ['softmaxout_1']);
+ console.log('result:', result);
+ assertTensorEqual(result.softmaxout_1, expectedOutput0);
+ };
+ workerMain().then(
+ () => {
+ parentPort?.postMessage({ result: 'success' });
+ },
+ (err) => {
+ parentPort?.postMessage({ result: 'failed', error: err });
+ },
+ );
+}
diff --git a/js/node/test/test-main.ts b/js/node/test/test-main.ts
index 6e7905a24711a..ec7d4e2fc12d0 100644
--- a/js/node/test/test-main.ts
+++ b/js/node/test/test-main.ts
@@ -21,6 +21,7 @@ require('./unittests/lib/tensor');
// E2E tests
require('./e2e/simple-e2e-tests');
require('./e2e/inference-session-run');
+require('./e2e/worker-test');
// Test ONNX spec tests
import { run as runTestRunner } from './test-runner';
diff --git a/js/scripts/update-version.ts b/js/scripts/update-version.ts
index df6a9ea334db5..ef7c5bfbbf0a6 100644
--- a/js/scripts/update-version.ts
+++ b/js/scripts/update-version.ts
@@ -1,8 +1,9 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
-// This script update source file "version.ts" under the following folders:
-// /js/${arg0}/lib/version.ts
+// This script updates the following source files:
+// - /js/${arg0}/lib/version.ts
+// - /js/node/script/install-metadata-versions.js (only for arg0=="node")
//
// version data is read from file /js/${arg0}/package.json
@@ -21,7 +22,9 @@ if (typeof version !== 'string') {
throw new Error(`failed to parse "version" from file: ${PACKAGE_JSON_FILE}`);
}
-const FILE_CONTENT = `// Copyright (c) Microsoft Corporation. All rights reserved.
+{
+ // update /js/${arg0}/lib/version.ts
+ const FILE_CONTENT = `// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// This file is generated by /js/scripts/update-version.ts
@@ -30,4 +33,45 @@ const FILE_CONTENT = `// Copyright (c) Microsoft Corporation. All rights reserve
export const version = ${JSON.stringify(version)};
`;
-fs.writeFileSync(path.join(__dirname, '..', packageName, 'lib', 'version.ts'), FILE_CONTENT);
+ fs.writeFileSync(path.join(__dirname, '..', packageName, 'lib', 'version.ts'), FILE_CONTENT);
+}
+
+if (packageName === 'node') {
+ // update /js/node/script/install-metadata-versions.js
+
+ // If there is a second argument, use it as the version candidates. Otherwise, use the version from package.json.
+ // ";" will be used as the separator.
+ const versionCandidates = (process.argv[3] ?? '')
+ .split(';')
+ .map((v) => v.trim())
+ .filter((v) => !!v);
+
+ type NodeInstallMetadataVersions = Record<string, Array<{ feed: string; version: string }>>;
+ const versions: NodeInstallMetadataVersions = { nuget: [] };
+
+ if (versionCandidates.length > 0) {
+ // append dev versions
+ for (const v of versionCandidates) {
+ versions.nuget.push({
+ feed: 'nuget_nightly',
+ version: v,
+ });
+ }
+ } else {
+ // append release version
+ versions.nuget.push({
+ feed: 'nuget',
+ version,
+ });
+ }
+
+ const FILE_CONTENT = `// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// This file is generated by /js/scripts/update-version.ts
+// Do not modify file content manually.
+
+module.exports = ${JSON.stringify(versions)};
+`;
+ fs.writeFileSync(path.join(__dirname, '..', 'node', 'script', 'install-metadata-versions.js'), FILE_CONTENT);
+}
diff --git a/js/web/lib/wasm/jsep/backend-webnn.ts b/js/web/lib/wasm/jsep/backend-webnn.ts
index c2a855bedca22..4de02983d068d 100644
--- a/js/web/lib/wasm/jsep/backend-webnn.ts
+++ b/js/web/lib/wasm/jsep/backend-webnn.ts
@@ -79,11 +79,20 @@ export class WebNNBackend {
* Maps from session id to list of graph inputs.
*/
private sessionGraphInputs: Map<number, string[]> = new Map();
+ /**
+ * Maps from session id to list of graph outputs.
+ */
+ private sessionGraphOutputs: Map<number, string[]> = new Map();
/**
* Temporary graph inputs for the current session.
* These inputs will be registered when the session is created.
*/
private temporaryGraphInputs: string[] = [];
+ /**
+ * Temporary graph outputs for the current session.
+ * These outputs will be registered when the session is created.
+ */
+ private temporaryGraphOutputs: string[] = [];
/**
* Temporary tensors for the current session.
*/
@@ -167,10 +176,15 @@ export class WebNNBackend {
this.sessionGraphInputs.set(sessionId, this.temporaryGraphInputs);
this.temporaryGraphInputs = [];
}
+ if (this.temporaryGraphOutputs.length > 0) {
+ this.sessionGraphOutputs.set(sessionId, this.temporaryGraphOutputs);
+ this.temporaryGraphOutputs = [];
+ }
}
public onReleaseSession(sessionId: number): void {
this.sessionGraphInputs.delete(sessionId);
+ this.sessionGraphOutputs.delete(sessionId);
const mlContext = this.mlContextBySessionId.get(sessionId)!;
if (!mlContext) {
// Current session is not a WebNN session.
@@ -363,6 +377,10 @@ export class WebNNBackend {
this.temporaryGraphInputs.push(inputName);
}
+ public registerGraphOutput(outputName: string): void {
+ this.temporaryGraphOutputs.push(outputName);
+ }
+
public isGraphInput(sessionId: number, inputName: string): boolean {
const inputNames = this.sessionGraphInputs.get(sessionId);
if (!inputNames) {
@@ -371,6 +389,14 @@ export class WebNNBackend {
return inputNames.includes(inputName);
}
+ public isGraphOutput(sessionId: number, outputName: string): boolean {
+ const outputNames = this.sessionGraphOutputs.get(sessionId);
+ if (!outputNames) {
+ return false;
+ }
+ return outputNames.includes(outputName);
+ }
+
public isInt64Supported(sessionId: number): boolean {
const context = this.mlContextBySessionId.get(sessionId);
return !!context?.opSupportLimits().input.dataTypes.includes('int64');
diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts
index 8dd643293937b..227c89a53afc6 100644
--- a/js/web/lib/wasm/wasm-core-impl.ts
+++ b/js/web/lib/wasm/wasm-core-impl.ts
export const initEp = async (env: Env, epName: string): Promise<void> => {
/**
* valid data locations for input/output tensors.
*/
-type SupportedTensorDataLocationForInputOutput = 'cpu' | 'cpu-pinned' | 'gpu-buffer' | 'ml-tensor';
+type SupportedTensorDataLocationForInputOutput =
+ | 'cpu'
+ | 'cpu-pinned'
+ | 'gpu-buffer'
+ | 'ml-tensor'
+ // Use 'ml-tensor' during inference, but output a tensor located on the CPU.
+ | 'ml-tensor-cpu-output';
type IOBindingState = {
/**
@@ -424,6 +430,11 @@ export const createSession = async (
typeof options?.preferredOutputLocation === 'string'
? options.preferredOutputLocation
: (options?.preferredOutputLocation?.[nameString] ?? 'cpu');
+ const isGraphOutput = wasm.webnnIsGraphOutput;
+ if (location === 'cpu' && isGraphOutput && isGraphOutput(sessionHandle, nameString)) {
+ outputPreferredLocations.push('ml-tensor-cpu-output');
+ continue;
+ }
if (location !== 'cpu' && location !== 'cpu-pinned' && location !== 'gpu-buffer' && location !== 'ml-tensor') {
throw new Error(`Not supported preferred output location: ${location}.`);
}
@@ -438,7 +449,10 @@ export const createSession = async (
// use IO binding only when at least one output is preferred to be on GPU.
let bindingState: IOBindingState | null = null;
- if (!BUILD_DEFS.DISABLE_JSEP && outputPreferredLocations.some((l) => l === 'gpu-buffer' || l === 'ml-tensor')) {
+ if (
+ !BUILD_DEFS.DISABLE_JSEP &&
+ outputPreferredLocations.some((l) => l === 'gpu-buffer' || l === 'ml-tensor' || l === 'ml-tensor-cpu-output')
+ ) {
ioBindingHandle = wasm._OrtCreateBinding(sessionHandle);
if (ioBindingHandle === 0) {
checkLastError("Can't create IO binding.");
@@ -447,7 +461,10 @@ export const createSession = async (
bindingState = {
handle: ioBindingHandle,
outputPreferredLocations,
- outputPreferredLocationsEncoded: outputPreferredLocations.map((l) => dataLocationStringToEnum(l)),
+ outputPreferredLocationsEncoded: outputPreferredLocations
+ // 'ml-tensor-cpu-output' is treated as 'ml-tensor' for the purpose of IO binding.
+ .map((l) => (l === 'ml-tensor-cpu-output' ? 'ml-tensor' : l))
+ .map((l) => dataLocationStringToEnum(l)),
};
}
@@ -599,10 +616,11 @@ export const prepareInputOutputTensor = async (
}
} else {
const isGraphInput = wasm.webnnIsGraphInput;
- if (dataType !== 'string' && isGraphInput) {
+ const isGraphOutput = wasm.webnnIsGraphOutput;
+ if (dataType !== 'string' && isGraphInput && isGraphOutput) {
const tensorName = wasm.UTF8ToString(tensorNameUTF8Encoded);
// Promote the tensor to 'ml-tensor' if it is a graph input.
- if (isGraphInput(sessionId, tensorName)) {
+ if (isGraphInput(sessionId, tensorName) || isGraphOutput(sessionId, tensorName)) {
const dataTypeEnum = tensorDataTypeStringToEnum(dataType);
dataByteLength = calculateTensorSizeInBytes(dataTypeEnum, dims)!;
actualLocation = 'ml-tensor';
@@ -810,6 +828,7 @@ export const run = async (
}
const output: TensorMetadata[] = [];
+ const outputPromises: Array<Promise<[number, Tensor.DataType]>> = [];
for (let i = 0; i < outputCount; i++) {
const tensor = Number(wasm.getValue(outputValuesOffset + i * ptrSize, '*'));
@@ -958,6 +977,20 @@ export const run = async (
},
'ml-tensor',
]);
+ } else if (preferredLocation === 'ml-tensor-cpu-output' && size > 0) {
+ const data = wasm.webnnCreateMLTensorDownloader!(dataOffset, type as Tensor.MLTensorDataTypes)();
+ const index = output.length;
+ // Delay the data download and the tensor release until all output tensor downloads can be awaited together.
+ keepOutputTensor = true;
+ outputPromises.push(
+ (async () => {
+ const result: [number, Tensor.DataType] = [index, await data];
+ wasm.webnnReleaseTensorId!(dataOffset);
+ wasm._OrtReleaseTensor(tensor);
+ return result;
+ })(),
+ );
+ output.push([type, dims, [], 'cpu']);
} else {
const typedArrayConstructor = tensorTypeToTypedArrayConstructor(type);
const data = new typedArrayConstructor(size);
@@ -975,7 +1008,6 @@ export const run = async (
if (!keepOutputTensor) {
wasm._OrtReleaseTensor(tensor);
}
- wasm.webnnOnRunEnd?.(sessionHandle);
}
}
@@ -992,8 +1024,14 @@ export const run = async (
false,
]);
}
+ // Wait for all output tensor data to be downloaded.
+ for (const [index, data] of await Promise.all(outputPromises)) {
+ output[index][2] = data;
+ }
return output;
} finally {
+ wasm.webnnOnRunEnd?.(sessionHandle);
+
wasm.stackRestore(beforeRunStack);
if (BUILD_DEFS.USE_WEBGPU_EP) {
diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts
index b2ca8480f1546..22af02b2790f4 100644
--- a/js/web/lib/wasm/wasm-types.ts
+++ b/js/web/lib/wasm/wasm-types.ts
@@ -287,6 +287,19 @@ export declare namespace JSEP {
* @returns whether the input is a WebNN graph input.
*/
webnnIsGraphInput: (sessionId: number, inputName: string) => boolean;
+ /**
+ * [exported from pre-jsep.js] Register a WebNN graph output.
+ * @param outputName - specify the output name.
+ */
+ webnnRegisterGraphOutput: (outputName: string) => void;
+ /**
+ * [exported from pre-jsep.js] Check if a graph output is a WebNN graph output.
+ * @param sessionId - specify the session ID.
+ * @param outputName - specify the output name.
+ * @returns whether the output is a WebNN graph output.
+ */
+ webnnIsGraphOutput: (sessionId: number, outputName: string) => boolean;
+
/**
* [exported from pre-jsep.js] Create a temporary MLTensor for a session.
* @param sessionId - specify the session ID.
diff --git a/js/web/script/pull-prebuilt-wasm-artifacts.ts b/js/web/script/pull-prebuilt-wasm-artifacts.ts
index e5eace8d80dcf..89c57c191de0e 100644
--- a/js/web/script/pull-prebuilt-wasm-artifacts.ts
+++ b/js/web/script/pull-prebuilt-wasm-artifacts.ts
@@ -189,7 +189,7 @@ async function main() {
if (!run) {
// API reference: https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28#list-workflow-runs-for-a-workflow
const mainRunData = await downloadJson(
- `https://api.github.com/repos/Microsoft/onnxruntime/actions/workflows/152051496/runs?branch=main${allowImcomplete ? '' : '&status=completed'}&per_page=1&exclude_pull_requests=1`,
+ `https://api.github.com/repos/Microsoft/onnxruntime/actions/workflows/152051496/runs?branch=main${allowImcomplete ? '' : '&status=success'}&per_page=1&exclude_pull_requests=1`,
);
if (mainRunData.workflow_runs.length === 0) {
throw new Error('No build found');
diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts
index 088a66b24f7bd..f546f58a28bfa 100644
--- a/js/web/script/test-runner-cli-args.ts
+++ b/js/web/script/test-runner-cli-args.ts
@@ -405,7 +405,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
// and ChromeCanary is not in CI.
const defaultBrowserBackends = ['webgl', 'webgpu', 'wasm' /*, 'webnn'*/];
- const nodejsBackends = ['cpu', 'wasm'];
+ const nodejsBackends = ['cpu', 'wasm', 'webgpu'];
const backendArgs = args.backend || args.b;
const backend =
typeof backendArgs !== 'string'
diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json
index 708e458748b3a..c9da59b4b0021 100644
--- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json
+++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json
@@ -12,7 +12,7 @@
},
"devDependencies": {
"@vitejs/plugin-vue": "^5.2.1",
- "vite": "^6.2.5"
+ "vite": "^6.2.6"
}
},
"node_modules/@babel/helper-string-parser": {
@@ -1069,9 +1069,9 @@
}
},
"node_modules/vite": {
- "version": "6.2.5",
- "resolved": "https://registry.npmjs.org/vite/-/vite-6.2.5.tgz",
- "integrity": "sha512-j023J/hCAa4pRIUH6J9HemwYfjB5llR2Ps0CWeikOtdR8+pAURAk0DoJC5/mm9kd+UgdnIy7d6HE4EAvlYhPhA==",
+ "version": "6.2.6",
+ "resolved": "https://registry.npmjs.org/vite/-/vite-6.2.6.tgz",
+ "integrity": "sha512-9xpjNl3kR4rVDZgPNdTL0/c6ao4km69a/2ihNQbcANz8RuCOK3hQBmLSJf3bRKVQjVMda+YvizNE8AwvogcPbw==",
"dev": true,
"license": "MIT",
"dependencies": {
diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json
index 904db7a41de9c..5169734074299 100644
--- a/js/web/test/e2e/exports/testcases/vite-default/package.json
+++ b/js/web/test/e2e/exports/testcases/vite-default/package.json
@@ -13,6 +13,6 @@
},
"devDependencies": {
"@vitejs/plugin-vue": "^5.2.1",
- "vite": "^6.2.5"
+ "vite": "^6.2.6"
}
}
diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts
index 96ff3a16a716c..f32107e4a0c65 100644
--- a/js/web/test/test-main.ts
+++ b/js/web/test/test-main.ts
@@ -13,7 +13,16 @@ import { Logger } from '../lib/onnxjs/instrument';
import { Test } from './test-types';
-if (ORT_WEB_TEST_CONFIG.model.some((testGroup) => testGroup.tests.some((test) => test.backend === 'cpu'))) {
+if (
+ // when NPM test is launched with `-e=node` and (`-b=cpu` or `-b=webgpu`), load the ONNX Runtime Node.js binding.
+ platform.name === 'Node.js' &&
+ (ORT_WEB_TEST_CONFIG.model.some((testGroup) =>
+ testGroup.tests.some((test) => test.backend === 'cpu' || test.backend === 'webgpu'),
+ ) ||
+ ORT_WEB_TEST_CONFIG.op.some((testGroup) =>
+ testGroup.tests.some((test) => test.backend === 'cpu' || test.backend === 'webgpu'),
+ ))
+) {
// require onnxruntime-node
require('../../node');
}
diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
index 9f1bc46ee297d..68c4b01d2db20 100644
--- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
@@ -63,6 +63,7 @@ GroupQueryAttention::GroupQueryAttention(const OpKernelInfo& info)
if (!disable_flash_attention_) {
zeros_ = this->GetScratchBuffer<int>(kZerosCount, nullptr);
+ CUDA_CALL_THROW(cudaMemset(zeros_.get(), 0, kZerosCount * sizeof(int)));
}
}
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
index bb5de40eb27c5..eb5de4634a4d8 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -146,12 +146,12 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
const min_value : q_element_t = q_element_t(-65504.0);
// Default SHM usage limit is 16KB in Dawn.
- var<workgroup> k_tile : array<array<q_value_t, qkv_head_size_vec>, max_k_step>; // 96 * 2 * 16 = 3KB.
- var<workgroup> v_tile : array<array<q_value_t, qkv_head_size_vec>, max_k_step>; // 96 * 2 * 16 = 3KB.
+ // vec4<f16> * qkv_head_size_vec * max_k_step = 8 * (128/4) * 16 = 4KB. 128 is head_size for phi4.
+ var<workgroup> k_tile : array<array<q_value_t, qkv_head_size_vec>, max_k_step>;
+ var<workgroup> v_tile : array<array<q_value_t, qkv_head_size_vec>, max_k_step>;
 // Private memory per lane.
 var<private> q_tile : array<q_value_t, qkv_head_size_vec>;
- var<private> o_tile : array<q_value_t, qkv_head_size_vec>;
fn loadq(q_idx_global : u32, head_idx: u32)
{
// Stored as float16[batch_size,sequence_length,3072] the inputs as per onnx MHA
@@ -186,6 +186,34 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
v_tile[slot][idx%qkv_head_size_vec] = val;
}
}
+)HELPER_FN";
+
+ if (is_qualcomm_) {
+ shader.AdditionalImplementation() << R"HELPER_FN(
+ const half_qkv_head_size_vec = qkv_head_size_vec / 2u;
+
+ // Move half of o_tile from private memory into workgroup memory to reduce register pressure.
+ // Note that register spill was observed on Qualcomm if the whole o_tile is in private memory.
+ // vec4<f16> * half_qkv_head_size_vec * workgroup_size_x = 8 * (128/4/2) * 64 = 8KB.
+ var<workgroup> o_tile_r : array<array<q_value_t, half_qkv_head_size_vec>, workgroup_size_x>;
+
+ // Private memory per lane.
+ var<private> o_tile : array<q_value_t, half_qkv_head_size_vec>;
+ fn writeo(o_idx_global: u32, head_idx: u32, local_idx: u32)
+ {
+ // Stored as float16[batch_size,sequence_length,3072]
+ let offset = o_idx_global * num_heads * qkv_head_size_vec + head_idx * qkv_head_size_vec;
+ for (var idx:u32 = 0; idx < half_qkv_head_size_vec; idx ++)
+ {
+ output[offset+idx] = o_tile[idx];
+ output[offset+idx+half_qkv_head_size_vec] = o_tile_r[local_idx][idx];
+ }
+ }
+ )HELPER_FN";
+ } else {
+ shader.AdditionalImplementation() << R"HELPER_FN(
+ // Private memory per lane.
+ var<private> o_tile : array<q_value_t, qkv_head_size_vec>;
fn writeo(o_idx_global: u32, head_idx: u32)
{
// Stored as float16[batch_size,sequence_length,3072]
@@ -195,7 +223,8 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
output[offset+idx] = o_tile[idx];
}
}
-)HELPER_FN";
+ )HELPER_FN";
+ }
if (has_attention_bias_) {
shader.AdditionalImplementation() << R"HELPER_FN(
@@ -228,7 +257,7 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
// Each lane/thread is responsible for a single q.
shader.MainFunctionBody() << R"MAIN_FN(
let head_idx = u32(workgroup_idx / uniforms.num_seq_tile);
- let capped_sg_id = min(sg_id, max_k_step);
+ let capped_sg_id = min(sg_id, max_k_step - 1u);
let capped_sg_size = min(sg_size, max_k_step);
// Load Q
@@ -324,30 +353,31 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
qk_4[2] = select(min_value, qk_4[2], k_start+14 < seq_causal_length);
qk_4[3] = select(min_value, qk_4[3], k_start+15 < seq_causal_length);
}
-
- //
- // Compute SoftMax as per Flash Attention technique.
- //
- // Crux of Flash Attention is here, that allows for partial softmax computation,
- // direct update of output and merging with previous results.
- // https://courses.cs.washington.edu/courses/cse599m/23sp/notes/flashattn.pdf
- // Where b is the block size of the tile. Xi is storing QKtranspose for the ith tile.
- // mi_local is the max of Xi. Note: _ in this notation means what follows is a
- // subscript. max_j=1:b (Xi[j]) is the max of Xi[j] for j=1 to b.
- //
- // for i = 1, #tiles do
- // Xi = Q[k,:] Kt[:, (i-1) b : i b]
- // mi_local= max_j=1:b (Xi[j])
- // Mi = max(M_(i-1), mi_local)
- // d'_i = d'_(i-1) * e^(M_(i-1)-M_i) + Σ_j=1:b e^(Xi[j]-Mi)
- // o'_i = o'_(i-1) * d'_(i-1) * e^(M_(i-1)-M_i) / d'_i + Σ_j=1:b (e^(Xi[j]-Mi) / d'_i) V[j + (i - 1)b,:]
- // end
- //
- // In the code below:
- // dleft is the first term of d'_i expression above : d'_(i-1) * e^(M_(i-1)-M_i).
- // sum is the second term of the same expression : Σ_j=1:b e^(Xi[j]-Mi)
- // o_ratio is the part of the first term of o'_i expression above : d'_(i-1) * e^(M_(i-1)-M_i) / d'_i
- //
+)MAIN_FN";
+ //
+ // Compute SoftMax as per Flash Attention technique.
+ //
+ // Crux of Flash Attention is here, that allows for partial softmax computation,
+ // direct update of output and merging with previous results.
+ // https://courses.cs.washington.edu/courses/cse599m/23sp/notes/flashattn.pdf
+ // Where b is the block size of the tile. Xi is storing QKtranspose for the ith tile.
+ // mi_local is the max of Xi. Note: _ in this notation means what follows is a
+ // subscript. max_j=1:b (Xi[j]) is the max of Xi[j] for j=1 to b.
+ //
+ // for i = 1, #tiles do
+ // Xi = Q[k,:] Kt[:, (i-1) b : i b]
+ // mi_local= max_j=1:b (Xi[j])
+ // Mi = max(M_(i-1), mi_local)
+ // d'_i = d'_(i-1) * e^(M_(i-1)-M_i) + Σ_j=1:b e^(Xi[j]-Mi)
+ // o'_i = o'_(i-1) * d'_(i-1) * e^(M_(i-1)-M_i) / d'_i + Σ_j=1:b (e^(Xi[j]-Mi) / d'_i) V[j + (i - 1)b,:]
+ // end
+ //
+ // In the code below:
+ // dleft is the first term of d'_i expression above : d'_(i-1) * e^(M_(i-1)-M_i).
+ // sum is the second term of the same expression : Σ_j=1:b e^(Xi[j]-Mi)
+ // o_ratio is the part of the first term of o'_i expression above : d'_(i-1) * e^(M_(i-1)-M_i) / d'_i
+ //
+ shader.MainFunctionBody() << R"MAIN_FN(
var local_max_temp = max(qk_1, qk_2);
if (sg_size > 8)
{
@@ -379,6 +409,87 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
previous_denom = d;
let o_ratio = dleft / d;
+)MAIN_FN";
+
+ if (is_qualcomm_) {
+ shader.MainFunctionBody() << R"MAIN_FN(
+ if (sg_size > 8) {
+ for (var i:u32 = 0; i < half_qkv_head_size_vec; i++)
+ {
+ var val = v_tile[capped_sg_id][i];
+ var sum = subgroupShuffle(val, 0) * qk_1[0];
+ sum += subgroupShuffle(val, 1) * qk_1[1];
+ sum += subgroupShuffle(val, 2) * qk_1[2];
+ sum += subgroupShuffle(val, 3) * qk_1[3];
+ sum += subgroupShuffle(val, 4) * qk_2[0];
+ sum += subgroupShuffle(val, 5) * qk_2[1];
+ sum += subgroupShuffle(val, 6) * qk_2[2];
+ sum += subgroupShuffle(val, 7) * qk_2[3];
+ sum += subgroupShuffle(val, 8) * qk_3[0];
+ sum += subgroupShuffle(val, 9) * qk_3[1];
+ sum += subgroupShuffle(val, 10) * qk_3[2];
+ sum += subgroupShuffle(val, 11) * qk_3[3];
+ sum += subgroupShuffle(val, 12) * qk_4[0];
+ sum += subgroupShuffle(val, 13) * qk_4[1];
+ sum += subgroupShuffle(val, 14) * qk_4[2];
+ sum += subgroupShuffle(val, 15) * qk_4[3];
+ o_tile[i] = o_tile[i] * o_ratio + sum;
+
+ val = v_tile[capped_sg_id][half_qkv_head_size_vec + i];
+ sum = subgroupShuffle(val, 0) * qk_1[0];
+ sum += subgroupShuffle(val, 1) * qk_1[1];
+ sum += subgroupShuffle(val, 2) * qk_1[2];
+ sum += subgroupShuffle(val, 3) * qk_1[3];
+ sum += subgroupShuffle(val, 4) * qk_2[0];
+ sum += subgroupShuffle(val, 5) * qk_2[1];
+ sum += subgroupShuffle(val, 6) * qk_2[2];
+ sum += subgroupShuffle(val, 7) * qk_2[3];
+ sum += subgroupShuffle(val, 8) * qk_3[0];
+ sum += subgroupShuffle(val, 9) * qk_3[1];
+ sum += subgroupShuffle(val, 10) * qk_3[2];
+ sum += subgroupShuffle(val, 11) * qk_3[3];
+ sum += subgroupShuffle(val, 12) * qk_4[0];
+ sum += subgroupShuffle(val, 13) * qk_4[1];
+ sum += subgroupShuffle(val, 14) * qk_4[2];
+ sum += subgroupShuffle(val, 15) * qk_4[3];
+ o_tile_r[local_idx][i] = o_tile_r[local_idx][i] * o_ratio + sum;
+ }
+ }
+ else
+ {
+ for (var i:u32 = 0; i < half_qkv_head_size_vec; i++)
+ {
+ var val = v_tile[capped_sg_id][i];
+ var sum = subgroupShuffle(val, 0) * qk_1[0];
+ sum += subgroupShuffle(val, 1) * qk_1[1];
+ sum += subgroupShuffle(val, 2) * qk_1[2];
+ sum += subgroupShuffle(val, 3) * qk_1[3];
+ sum += subgroupShuffle(val, 4) * qk_2[0];
+ sum += subgroupShuffle(val, 5) * qk_2[1];
+ sum += subgroupShuffle(val, 6) * qk_2[2];
+ sum += subgroupShuffle(val, 7) * qk_2[3];
+ o_tile[i] = o_tile[i] * o_ratio + sum;
+
+ val = v_tile[capped_sg_id][half_qkv_head_size_vec + i];
+ sum = subgroupShuffle(val, 0) * qk_1[0];
+ sum += subgroupShuffle(val, 1) * qk_1[1];
+ sum += subgroupShuffle(val, 2) * qk_1[2];
+ sum += subgroupShuffle(val, 3) * qk_1[3];
+ sum += subgroupShuffle(val, 4) * qk_2[0];
+ sum += subgroupShuffle(val, 5) * qk_2[1];
+ sum += subgroupShuffle(val, 6) * qk_2[2];
+ sum += subgroupShuffle(val, 7) * qk_2[3];
+ o_tile_r[local_idx][i] = o_tile_r[local_idx][i] * o_ratio + sum;
+ }
+ }
+ }
+
+ if (valid_q) {
+ writeo(q_idx_global, head_idx, local_idx);
+ }
+)MAIN_FN";
+ } else {
+ shader.MainFunctionBody() << R"MAIN_FN(
if (sg_size > 8) {
for (var i:u32 = 0; i < qkv_head_size_vec; i++)
{
@@ -424,6 +535,7 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
writeo(q_idx_global, head_idx);
}
)MAIN_FN";
+ }
return Status::OK();
}
@@ -761,7 +873,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
if (parameters.sequence_length_ > 1) {
const uint32_t tile_size = 64;
bool has_attention_bias = attention_bias != nullptr;
- FlashAttentionProgram program{"FlashAttention", has_attention_bias, parameters.head_size_, parameters.num_heads_};
+ bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"};
+ FlashAttentionProgram program{"FlashAttention", has_attention_bias, is_qualcomm, parameters.head_size_, parameters.num_heads_};
program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, 4},
{present_key, ProgramTensorMetadataDependency::TypeAndRank, 4},
{present_value, ProgramTensorMetadataDependency::TypeAndRank, 4}});
@@ -771,13 +884,10 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
program.AddOutputs({{output, ProgramTensorMetadataDependency::TypeAndRank, 4}});
const float alpha = parameters.scale_ == 0.0f ? 1.f / sqrt(static_cast<float>(parameters.head_size_))
: parameters.scale_;
- std::string cache_hint = std::to_string(has_attention_bias) +
- std::to_string(parameters.head_size_) +
- std::to_string(parameters.num_heads_);
const uint32_t num_seq_tile = (parameters.sequence_length_ + tile_size - 1) / tile_size;
program.SetDispatchGroupSize(parameters.num_heads_ * num_seq_tile)
.SetWorkgroupSize(tile_size)
- .CacheHint(cache_hint)
+ .CacheHint(has_attention_bias, parameters.head_size_, parameters.num_heads_, is_qualcomm)
+ .AddUniformVariables({{static_cast<uint32_t>(parameters.sequence_length_)},
+ {static_cast<uint32_t>(parameters.total_sequence_length_)},
+ {static_cast<uint32_t>(parameters.past_present_share_buffer_ ? parameters.past_sequence_length_ : parameters.total_sequence_length_)},
@@ -821,7 +931,8 @@ bool CanApplyFlashAttention(const Tensor* bias, const Tensor* present_key, const
bias == nullptr &&
context.HasFeature(wgpu::FeatureName::Subgroups) &&
present_key != nullptr && present_value != nullptr && present_key->SizeInBytes() > 0 &&
- present_value->SizeInBytes() > 0 && parameters.head_size_ % 4 == 0;
+ present_value->SizeInBytes() > 0 &&
+ ((context.AdapterInfo().vendor == std::string_view{"qualcomm"} && parameters.head_size_ % 8 == 0) || parameters.head_size_ % 4 == 0);
}
} // namespace webgpu
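The relocated comment block above (now emitted as C++ comments rather than WGSL source) describes the streaming-softmax recurrence at the heart of flash attention. Restated in LaTeX, with $X_i$ the tile's $QK^\top$ scores and $b$ the tile size:

$$
\begin{aligned}
M_i &= \max\bigl(M_{i-1},\ \max_{j=1:b} X_i[j]\bigr),\\
d'_i &= d'_{i-1}\,e^{M_{i-1}-M_i} + \sum_{j=1}^{b} e^{X_i[j]-M_i},\\
o'_i &= o'_{i-1}\,\frac{d'_{i-1}\,e^{M_{i-1}-M_i}}{d'_i} + \sum_{j=1}^{b} \frac{e^{X_i[j]-M_i}}{d'_i}\,V[j+(i-1)b,:].
\end{aligned}
$$

In the shader, `dleft` is the first term of $d'_i$, `sum` is the second, and `o_ratio` is the factor $d'_{i-1}e^{M_{i-1}-M_i}/d'_i$ applied to the running output.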
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
index c066d6249c8b2..181e411cdc91f 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
@@ -36,10 +36,12 @@ class FlashAttentionProgram final : public Program {
public:
FlashAttentionProgram(const std::string& kernel_name,
bool has_attention_bias,
+ bool is_qualcomm,
int qkv_head_size,
int qkv_num_heads)
: Program{kernel_name},
has_attention_bias_(has_attention_bias),
+ is_qualcomm_(is_qualcomm),
qkv_head_size_(qkv_head_size),
qkv_num_heads_(qkv_num_heads) {
}
@@ -57,6 +59,7 @@ class FlashAttentionProgram final : public Program {
private:
bool has_attention_bias_;
+ bool is_qualcomm_;
int qkv_head_size_;
int qkv_num_heads_;
};
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
index 0e75990045b4a..22a0034ed8013 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
@@ -684,7 +684,9 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context
}
// On FP32 only GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M.
- if ((M >= kMinMForTileOptimization || y->DataType() == DataTypeImpl::GetType<float>()) && CanApplyDP4AMatrixMatMulNBits(context, accuracy_level_, block_size, batch_count, N, K, components_a, has_zero_points)) {
+ if ((M >= kMinMForTileOptimization || y->DataType() == DataTypeImpl::GetType<float>() ||
+ context.AdapterInfo().vendor == std::string_view{"qualcomm"}) &&
+ CanApplyDP4AMatrixMatMulNBits(context, accuracy_level_, block_size, batch_count, N, K, components_a, has_zero_points)) {
return ApplyDP4AMatrixMatMulNBits(a, b, scales, M, N, K, block_size, kMinMForTileOptimization, context, y);
}
diff --git a/onnxruntime/core/common/string_utils.h b/onnxruntime/core/common/string_utils.h
index 716eed1afec51..c2e26f629330f 100644
--- a/onnxruntime/core/common/string_utils.h
+++ b/onnxruntime/core/common/string_utils.h
@@ -3,6 +3,8 @@
#pragma once
+#include <algorithm>
+#include <cctype>
#include <sstream>
#include <string>
#include <string_view>
@@ -84,5 +86,21 @@ inline uint32_t GetHashFromString(const std::string& str_value) {
return hash;
}
+/**
+ * Returns a lowercase version of the input string.
+ * @param str The string to lowercase.
+ * @return The lowercased string.
+ */
+inline std::string GetLowercaseString(std::string str) {
+ // https://en.cppreference.com/w/cpp/string/byte/tolower
+ // The behavior of tolower from is undefined if the argument is neither representable as unsigned char
+ // nor equal to EOF. To use tolower safely with a plain char (or signed char), the argument must be converted to
+ // unsigned char.
+ std::transform(str.begin(), str.end(), str.begin(), [](unsigned char c) {
+ return static_cast(std::tolower(c));
+ });
+ return str;
+}
+
} // namespace utils
} // namespace onnxruntime
diff --git a/onnxruntime/core/framework/config_options.cc b/onnxruntime/core/framework/config_options.cc
index 9fe5beafd6e7e..a638660de262e 100644
--- a/onnxruntime/core/framework/config_options.cc
+++ b/onnxruntime/core/framework/config_options.cc
@@ -31,14 +31,14 @@ std::string ConfigOptions::GetConfigOrDefault(const std::string& config_key,
Status ConfigOptions::AddConfigEntry(const char* config_key, const char* config_value) noexcept {
std::string key = config_key;
- if (key.empty() || key.length() > 128)
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Config key is empty or longer than maximum length 128");
+ if (key.empty() || key.length() > kMaxKeyLength)
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Config key is empty or longer than maximum length ",
+ kMaxKeyLength);
std::string val = config_value;
- if (val.length() > onnxruntime::kMaxStrLen)
+ if (val.length() > kMaxValueLength)
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Config value is longer than maximum length: ",
- onnxruntime::kMaxStrLen);
+ "Config value is longer than maximum length: ", kMaxValueLength);
auto iter = configurations.find(config_key);
if (iter != configurations.cend()) {
@@ -52,6 +52,10 @@ Status ConfigOptions::AddConfigEntry(const char* config_key, const char* config_
return Status::OK();
}
+const std::unordered_map<std::string, std::string>& ConfigOptions::GetConfigOptionsMap() const noexcept {
+ return configurations;
+}
+
std::ostream& operator<<(std::ostream& os, const ConfigOptions& config_options) {
for (const auto& [key, value] : config_options.configurations) {
os << " " << key << ": " << value;
diff --git a/onnxruntime/core/framework/config_options.h b/onnxruntime/core/framework/config_options.h
index efdfdb45abbaa..028220d15fc8a 100644
--- a/onnxruntime/core/framework/config_options.h
+++ b/onnxruntime/core/framework/config_options.h
@@ -15,6 +15,11 @@ namespace onnxruntime {
* Provides infrastructure to add/get config entries
*/
struct ConfigOptions {
+ // Maximum key/value string lengths specified in
+ // core/session/onnxruntime_session_options_config_keys.h
+ static constexpr size_t kMaxKeyLength = 1024;
+ static constexpr size_t kMaxValueLength = 2048;
+
std::unordered_map<std::string, std::string> configurations;
// Gets the config string associated with the given config_key.
@@ -33,6 +38,9 @@ struct ConfigOptions {
// Add a config pair (config_key, config_value) to this instance of ConfigOptions
Status AddConfigEntry(const char* config_key, const char* config_value) noexcept;
+ // Gets a constant reference to the map of all configurations.
+ const std::unordered_map<std::string, std::string>& GetConfigOptionsMap() const noexcept;
+
friend std::ostream& operator<<(std::ostream& os, const ConfigOptions& config_options);
};
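The new accessor makes it possible to enumerate all entries without going through GetConfigOrDefault key by key. A small illustrative consumer (the function name is hypothetical):

```cpp
#include <iostream>
#include "core/framework/config_options.h"

// Print every session config entry via the accessor added above.
void DumpConfigOptions(const onnxruntime::ConfigOptions& config) {
  for (const auto& [key, value] : config.GetConfigOptionsMap()) {
    std::cout << key << " = " << value << "\n";
  }
}
```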
diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc
index 50f14104cfd7a..7f6cb03936be1 100644
--- a/onnxruntime/core/framework/graph_partitioner.cc
+++ b/onnxruntime/core/framework/graph_partitioner.cc
@@ -748,7 +748,8 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
// Validate the ep_context_path to make sure it is file path and check whether the file exist already
static Status GetValidatedEpContextPath(const std::filesystem::path& ep_context_path,
const std::filesystem::path& model_path,
- std::filesystem::path& context_cache_path) {
+ std::filesystem::path& context_cache_path,
+ bool allow_overwrite_output_model = false) {
if (!ep_context_path.empty()) {
context_cache_path = ep_context_path;
if (!context_cache_path.has_filename()) {
@@ -765,7 +766,7 @@ static Status GetValidatedEpContextPath(const std::filesystem::path& ep_context_
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Both ep_context_path and model_path are empty.");
}
- if (std::filesystem::exists(context_cache_path)) {
+ if (std::filesystem::exists(context_cache_path) && !allow_overwrite_output_model) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to generate EP context model since the file '",
context_cache_path, "' exist already. Please remove the EP context model if you want to re-generate it.");
}
@@ -775,8 +776,7 @@ static Status GetValidatedEpContextPath(const std::filesystem::path& ep_context_
static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
const Graph& graph,
- const std::filesystem::path& ep_context_path,
- const std::filesystem::path& ep_context_ext_ini_path,
+ const EpContextModelGenerationOptions& ep_context_gen_options,
const logging::Logger& logger) {
InlinedVector<const Node*> all_ep_context_nodes;
for (const auto& ep : execution_providers) {
@@ -785,6 +785,9 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
}
if (all_ep_context_nodes.size() < 1) {
+ ORT_RETURN_IF(ep_context_gen_options.error_if_no_compiled_nodes,
+ "Compiled model does not contain any EPContext nodes. "
+ "Check that the session EPs support compilation and can execute at least one model subgraph.");
return Status::OK();
}
@@ -798,7 +801,10 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
};
std::filesystem::path context_cache_path;
- ORT_RETURN_IF_ERROR(GetValidatedEpContextPath(ep_context_path, graph.ModelPath(), context_cache_path));
+ ORT_RETURN_IF_ERROR(GetValidatedEpContextPath(ep_context_gen_options.output_model_file_path,
+ graph.ModelPath(),
+ context_cache_path,
+ ep_context_gen_options.overwrite_existing_output_file));
Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(),
graph.GetModel().ModelPath(), // use source model path so that external initializers can find the data file path
@@ -848,20 +854,39 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
}
}
- size_t ini_size_threshold = 0;
- std::filesystem::path external_ini_path;
- if (ep_context_ext_ini_path.empty()) {
+ size_t ini_size_threshold = ep_context_gen_options.output_external_initializer_size_threshold;
+ std::filesystem::path external_ini_path = ep_context_gen_options.output_external_initializers_file_path;
+ if (external_ini_path.empty()) {
// Set the threshold to the max so all initializers are forced into the Onnx file
ini_size_threshold = SIZE_MAX;
external_ini_path = "./model_ext_ini.bin";
- } else {
- // Set the theshold to 0 so all initializers are forced into the external file
- ini_size_threshold = 0;
- external_ini_path = ep_context_ext_ini_path;
}
+
ModelSavingOptions model_saving_options{ini_size_threshold};
- ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(ep_context_model, context_cache_path,
- external_ini_path, model_saving_options));
+
+ if (ep_context_gen_options.output_model_buffer_ptr != nullptr &&
+ ep_context_gen_options.output_model_buffer_size_ptr != nullptr &&
+ ep_context_gen_options.output_model_buffer_allocator != nullptr) {
+ ORT_RETURN_IF_ERROR(ep_context_model.MainGraph().Resolve());
+ // TODO(adrianlizarraga): Investigate if we can make this more memory efficient.
+ // May be able to use allocator to directly allocate the ModelProto to avoid a copy.
+ ONNX_NAMESPACE::ModelProto model_proto = ep_context_model.ToGraphProtoWithExternalInitializers(external_ini_path,
+ context_cache_path,
+ model_saving_options);
+ size_t buffer_size = model_proto.ByteSizeLong();
+ ORT_RETURN_IF(buffer_size > static_cast<size_t>(std::numeric_limits<int>::max()),
+ "Cannot serialize ONNX ModelProto larger than 2GB");
+
+ AllocatorPtr allocator = ep_context_gen_options.output_model_buffer_allocator;
+ IAllocatorUniquePtr<void> buffer = IAllocator::MakeUniquePtr<void>(allocator, buffer_size);
+ model_proto.SerializeToArray(buffer.get(), static_cast<int>(buffer_size));
+
+ *ep_context_gen_options.output_model_buffer_size_ptr = buffer_size;
+ *ep_context_gen_options.output_model_buffer_ptr = buffer.release();
+ } else {
+ ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(ep_context_model, context_cache_path,
+ external_ini_path, model_saving_options));
+ }
return Status::OK();
}
@@ -1110,6 +1135,7 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
const ConfigOptions& config_options,
const logging::Logger& logger,
Mode mode,
+ const EpContextModelGenerationOptions& ep_context_gen_options,
const layout_transformation::DebugGraphFn& debug_graph_fn) const {
// It is a greedy partitioning algorithm per provider preferences user provided when calling ONNX RUNTIME right now.
// 1. Execution providers' capabilities are checked one by one.
@@ -1156,12 +1182,12 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
if (mode == Mode::kNormal || mode == Mode::kAssignOnly) {
#if !defined(ORT_MINIMAL_BUILD)
- bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
- if (ep_context_enabled) {
- std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+ if (ep_context_gen_options.enable && ep_context_gen_options.output_model_buffer_ptr == nullptr) {
// Check before EP compile graphs
std::filesystem::path context_cache_path;
- ORT_RETURN_IF_ERROR(GetValidatedEpContextPath(ep_context_path, graph.ModelPath(), context_cache_path));
+ ORT_RETURN_IF_ERROR(GetValidatedEpContextPath(ep_context_gen_options.output_model_file_path, graph.ModelPath(),
+ context_cache_path,
+ ep_context_gen_options.overwrite_existing_output_file));
}
// We use this only if Resource Aware Partitioning is enabled for any of the EPs
@@ -1172,15 +1198,13 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_,
ep_acc_map, *graph_optimizer_registry_, logger));
- if (ep_context_enabled) {
- std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
- std::string external_ini_file_name = config_options.GetConfigOrDefault(
- kOrtSessionOptionsEpContextModelExternalInitializersFileName, "");
- ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, external_ini_file_name, logger));
+ if (ep_context_gen_options.enable) {
+ ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_gen_options, logger));
}
#else
ORT_UNUSED_PARAMETER(config_options);
ORT_UNUSED_PARAMETER(logger);
+ ORT_UNUSED_PARAMETER(ep_context_gen_options);
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "ONNX models are not supported in this build.");
#endif //! defined(ORT_MINIMAL_BUILD)
} else {
diff --git a/onnxruntime/core/framework/graph_partitioner.h b/onnxruntime/core/framework/graph_partitioner.h
index 87edc7a64c6b5..6e36d79701fd7 100644
--- a/onnxruntime/core/framework/graph_partitioner.h
+++ b/onnxruntime/core/framework/graph_partitioner.h
@@ -15,6 +15,7 @@ class ExecutionProviders;
class KernelRegistryManager;
class Model;
struct ConfigOptions;
+struct EpContextModelGenerationOptions;
class GraphPartitioner {
public:
@@ -49,6 +50,7 @@ class GraphPartitioner {
const ConfigOptions& config_options,
const logging::Logger& logger,
Mode mode = Mode::kNormal,
+ const EpContextModelGenerationOptions& ep_context_gen_options = {},
const layout_transformation::DebugGraphFn& debug_graph_fn = {}) const;
bool IsLoadCancellationFlagSet() const {
diff --git a/onnxruntime/core/framework/session_options.cc b/onnxruntime/core/framework/session_options.cc
index 9d6cd3e58225e..a56383034686c 100644
--- a/onnxruntime/core/framework/session_options.cc
+++ b/onnxruntime/core/framework/session_options.cc
@@ -4,6 +4,7 @@
#include "core/framework/session_options.h"
#include "core/common/logging/logging.h"
#include "core/framework/ort_value.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
namespace onnxruntime {
@@ -96,4 +97,21 @@ void SessionOptions::AddCustomOpLibraryHandle(PathString library_name, void* lib
}
#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
+EpContextModelGenerationOptions::EpContextModelGenerationOptions(const ConfigOptions& config_options) {
+ enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
+ output_model_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+ output_external_initializers_file_path = config_options.GetConfigOrDefault(
+ kOrtSessionOptionsEpContextModelExternalInitializersFileName, "");
+ output_external_initializer_size_threshold = 0;
+ embed_ep_context_in_model = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1";
+}
+
+EpContextModelGenerationOptions SessionOptions::GetEpContextGenerationOptions() const {
+ if (this->has_explicit_ep_context_gen_options) {
+ return this->ep_context_gen_options;
+ }
+
+ return EpContextModelGenerationOptions(this->config_options);
+}
+
} // namespace onnxruntime
diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h
index ef323b99b006c..4cf7829fef549 100644
--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@@ -11,6 +11,7 @@
#include <unordered_map>
#include <vector>
#include "core/common/inlined_containers.h"
+#include "core/framework/allocator.h"
#include "core/framework/config_options.h"
#include "core/framework/ort_value.h"
#include "core/session/onnxruntime_c_api.h"
@@ -69,6 +70,26 @@ struct FreeDimensionOverride {
using CheckLoadCancellationFn = std::function<bool()>;
+struct EpContextModelGenerationOptions {
+ EpContextModelGenerationOptions() = default;
+
+ // Initializes from string key/value pairs in session config options.
+ explicit EpContextModelGenerationOptions(const ConfigOptions& config_options);
+
+ bool enable = false;
+ bool overwrite_existing_output_file = false;
+ bool error_if_no_compiled_nodes = false;
+ bool embed_ep_context_in_model = false;
+
+ std::string output_model_file_path;
+ void** output_model_buffer_ptr = nullptr;
+ size_t* output_model_buffer_size_ptr = nullptr;
+ AllocatorPtr output_model_buffer_allocator = nullptr;
+
+ std::string output_external_initializers_file_path;
+ size_t output_external_initializer_size_threshold = 0;
+};
+
/**
* Configuration information for a session.
*/
@@ -199,6 +220,15 @@ struct SessionOptions {
// Load cancellation flag is necessary to be within shared memory as session_options are
// copied internally and the flag needs to be accessible across all copies.
std::shared_ptr<std::atomic_bool> load_cancellation_flag = std::make_shared<std::atomic_bool>(false);
+
+ // Options for generating compiled EPContext models were previously stored in session_options.config_options as
+ // string key/value pairs. To support more advanced options, such as setting input/output buffers, we
+ // now have to store EPContext options in a struct of type EpContextModelGenerationOptions.
+ // The function GetEpContextGenerationOptions() handles conversion of string key/value pairs to the new
+ // struct type.
+ bool has_explicit_ep_context_gen_options = false;
+ EpContextModelGenerationOptions ep_context_gen_options = {};
+ EpContextModelGenerationOptions GetEpContextGenerationOptions() const;
};
inline std::ostream& operator<<(std::ostream& os, const SessionOptions& session_options) {
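For context, the struct mirrors what the string-keyed path produces: GetEpContextGenerationOptions() falls back to parsing config_options unless has_explicit_ep_context_gen_options is set. A sketch of the two equivalent configuration paths (file names and values are illustrative only):

```cpp
#include "core/common/common.h"
#include "core/framework/session_options.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

void ConfigureEpContext(onnxruntime::SessionOptions& so) {
  // Legacy path: string key/value pairs, parsed later by the
  // EpContextModelGenerationOptions(const ConfigOptions&) constructor.
  ORT_THROW_IF_ERROR(so.config_options.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"));
  ORT_THROW_IF_ERROR(so.config_options.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "model_ctx.onnx"));

  // New path: populate the struct directly and mark it explicit so that
  // GetEpContextGenerationOptions() returns it instead of re-parsing strings.
  so.ep_context_gen_options.enable = true;
  so.ep_context_gen_options.output_model_file_path = "model_ctx.onnx";
  so.has_explicit_ep_context_gen_options = true;
}
```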
diff --git a/onnxruntime/core/framework/transpose_helper.cc b/onnxruntime/core/framework/transpose_helper.cc
index 32d15bdf9060b..75f9492fb071d 100644
--- a/onnxruntime/core/framework/transpose_helper.cc
+++ b/onnxruntime/core/framework/transpose_helper.cc
@@ -22,7 +22,8 @@ struct has_mlas_transpose : std::true_type {};
template <typename T>
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop,
- int64_t writes_per_writer_per_loop) {
+ int64_t writes_per_writer_per_loop, concurrency::ThreadPool* tp = nullptr) {
+ ORT_UNUSED_PARAMETER(tp);
const T* end;
for (int64_t l = 0; l < num_loops; ++l) {
T* output_for_first_writer = output_data;
@@ -48,10 +49,10 @@ typename std::enable_if::value, void>::type SimpleTranspo
template <typename T>
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop,
- int64_t writes_per_writer_per_loop) {
+ int64_t writes_per_writer_per_loop, concurrency::ThreadPool* tp = nullptr) {
for (int64_t l = 0; l < num_loops; ++l) {
MlasTranspose(input_data, output_data, static_cast<size_t>(writes_per_writer_per_loop),
- static_cast<size_t>(num_writers));
+ static_cast<size_t>(num_writers), tp);
input_data += writes_per_loop;
output_data += writes_per_loop;
}
@@ -82,25 +83,25 @@ void TransposeSingleAxisOutwards(gsl::span permutations, const Ten
switch (bytes_per_write) {
case (sizeof(uint8_t)): {
SimpleTransposeSingleAxisOutwards(input_data, output_data, num_loops, num_writers, writes_per_loop,
- writes_per_writer_per_loop);
+ writes_per_writer_per_loop, tp);
break;
}
case (sizeof(uint16_t)): {
SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint16_t*>(input_data),
reinterpret_cast<uint16_t*>(output_data), num_loops, num_writers,
- writes_per_loop, writes_per_writer_per_loop);
+ writes_per_loop, writes_per_writer_per_loop, tp);
break;
}
case (sizeof(uint32_t)): {
SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint32_t*>(input_data),
reinterpret_cast<uint32_t*>(output_data), num_loops, num_writers,
- writes_per_loop, writes_per_writer_per_loop);
+ writes_per_loop, writes_per_writer_per_loop, tp);
break;
}
case (sizeof(uint64_t)): {
SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint64_t*>(input_data),
reinterpret_cast<uint64_t*>(output_data), num_loops, num_writers,
- writes_per_loop, writes_per_writer_per_loop);
+ writes_per_loop, writes_per_writer_per_loop, tp);
break;
}
default: {
@@ -125,7 +126,8 @@ void TransposeSingleAxisOutwards(gsl::span permutations, const Ten
template <typename T>
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop,
- int64_t reads_per_reader_per_loop) {
+ int64_t reads_per_reader_per_loop, concurrency::ThreadPool* tp = nullptr) {
+ ORT_UNUSED_PARAMETER(tp);
T* end;
for (int64_t l = 0; l < num_loops; ++l) {
const T* input_for_first_reader = input_data;
@@ -150,10 +152,10 @@ typename std::enable_if::value, void>::type SimpleTranspo
template <typename T>
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop,
- int64_t reads_per_reader_per_loop) {
+ int64_t reads_per_reader_per_loop, concurrency::ThreadPool* tp = nullptr) {
for (int64_t l = 0; l < num_loops; ++l) {
MlasTranspose(input_data, output_data, static_cast<size_t>(num_readers),
- static_cast<size_t>(reads_per_reader_per_loop));
+ static_cast<size_t>(reads_per_reader_per_loop), tp);
input_data += reads_per_loop;
output_data += reads_per_loop;
}
@@ -162,7 +164,8 @@ typename std::enable_if::value, void>::type SimpleTranspos
// moving a single axis inwards where the read/write size is a power of 2 and between 8 and 64 bits.
// `input_shape_override` overrides the shape of `input` for compute purposes.
void TransposeSingleAxisInwards(gsl::span permutations, const Tensor& input, Tensor& output,
- size_t from, size_t to, const TensorShape* input_shape_override = nullptr) {
+ size_t from, size_t to, const TensorShape* input_shape_override = nullptr,
+ concurrency::ThreadPool* tp = nullptr) {
ORT_UNUSED_PARAMETER(permutations);
const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape();
@@ -184,25 +187,25 @@ void TransposeSingleAxisInwards(gsl::span permutations, const Tens
switch (bytes_per_read) {
case (sizeof(uint8_t)): {
SimpleTransposeSingleAxisInwards(input_data, output_data, num_loops, num_readers, reads_per_loop,
- reads_per_reader_per_loop);
+ reads_per_reader_per_loop, tp);
break;
}
case (sizeof(uint16_t)): {
SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint16_t*>(input_data),
reinterpret_cast<uint16_t*>(output_data), num_loops, num_readers, reads_per_loop,
- reads_per_reader_per_loop);
+ reads_per_reader_per_loop, tp);
break;
}
case (sizeof(uint32_t)): {
SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint32_t*>(input_data),
reinterpret_cast<uint32_t*>(output_data), num_loops, num_readers, reads_per_loop,
- reads_per_reader_per_loop);
+ reads_per_reader_per_loop, tp);
break;
}
case (sizeof(uint64_t)): {
SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint64_t*>(input_data),
reinterpret_cast<uint64_t*>(output_data), num_loops, num_readers, reads_per_loop,
- reads_per_reader_per_loop);
+ reads_per_reader_per_loop, tp);
break;
}
default: {
@@ -236,7 +239,7 @@ void SingleAxisTranspose(gsl::span<const size_t> permutations, const Tensor& inp
if (from > to) {
TransposeSingleAxisOutwards(permutations, input, output, from, to, input_shape_override, tp);
} else {
- TransposeSingleAxisInwards(permutations, input, output, from, to, input_shape_override);
+ TransposeSingleAxisInwards(permutations, input, output, from, to, input_shape_override, tp);
}
}
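Taken together, the transpose.cc changes above thread the optional pool through both single-axis paths. A minimal caller sketch, assuming the existing IsTransposeMovingSingleAxis helper and a kernel-owned pool (the wiring here is illustrative, not part of this patch):

    // Sketch: forward the kernel's thread pool so both directions can parallelize.
    void RunSingleAxisTranspose(gsl::span<const size_t> perms, const Tensor& in,
                                Tensor& out, concurrency::ThreadPool* tp) {
      size_t from = 0, to = 0;
      if (IsTransposeMovingSingleAxis(perms, from, to)) {
        SingleAxisTranspose(perms, in, out, from, to, /*input_shape_override*/ nullptr, tp);
      }
    }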
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index db21157d2fdce..266370997fd46 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -1056,49 +1056,15 @@ MlasComputeTanh(
// Transpose routines.
//
+template <typename DataType>
void
MLASCALL
MlasTranspose(
- const uint8_t* Input,
- uint8_t* Output,
- size_t M,
- size_t N
- );
-
-void
-MLASCALL
-MlasTranspose(
- const int8_t* Input,
- int8_t* Output,
- size_t M,
- size_t N
- );
-
-void
-MLASCALL
-MlasTranspose(
- const uint16_t* Input,
- uint16_t* Output,
- size_t M,
- size_t N
- );
-
-void
-MLASCALL
-MlasTranspose(
- const uint32_t* Input,
- uint32_t* Output,
+ const DataType* Input,
+ DataType* Output,
size_t M,
- size_t N
- );
-
-void
-MLASCALL
-MlasTranspose(
- const float* Input,
- float* Output,
- size_t M,
- size_t N
+ size_t N,
+ MLAS_THREADPOOL* ThreadPool
);
//
@@ -1940,20 +1906,22 @@ MlasConvDepthwise(
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
);
-
inline
void
MlasTranspose(
const MLAS_FP16* Input,
MLAS_FP16* Output,
size_t M,
- size_t N
+ size_t N,
+ MLAS_THREADPOOL* ThreadPool
)
{
MlasTranspose(
reinterpret_cast<const uint16_t*>(Input),
reinterpret_cast<uint16_t*>(Output),
- M, N);
+ M,
+ N,
+ ThreadPool);
}
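With the per-type overloads collapsed into a single template, call sites deduce the element type and pass the pool explicitly. A usage sketch, assuming mlas.h is included and that a null pool falls back to single-threaded execution:

    #include <vector>

    // Transpose a 3x4 uint32_t matrix into a 4x3 destination.
    std::vector<uint32_t> src(3 * 4), dst(4 * 3);
    MlasTranspose(src.data(), dst.data(), /*M=*/3, /*N=*/4, /*ThreadPool=*/nullptr);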
diff --git a/onnxruntime/core/mlas/inc/mlas_q4.h b/onnxruntime/core/mlas/inc/mlas_q4.h
index aec14070ffd55..c5f846fc7ffed 100644
--- a/onnxruntime/core/mlas/inc/mlas_q4.h
+++ b/onnxruntime/core/mlas/inc/mlas_q4.h
@@ -266,7 +266,7 @@ MlasBlockwiseQuantizedShape(
/**
* @brief Compute the sizes of the quantized data and quantization parameter buffers.
*
- * @param qbits The bit width of each quantized value.
+ * @tparam qbits The bit width of each quantized value.
* @param block_size The number of quantized values in a block.
* @param columnwise Whether a block contains values from a matrix column (true) or row (false).
* @param rows Number of matrix rows.
@@ -277,9 +277,9 @@ MlasBlockwiseQuantizedShape(
*
* If the qbits or block_size values are unsupported the output sizes will be zero.
*/
+template <int qbits>
void MLASCALL
MlasBlockwiseQuantizedBufferSizes(
- int qbits,
int block_size,
bool columnwise,
int rows,
diff --git a/onnxruntime/core/mlas/lib/q4_dq.cpp b/onnxruntime/core/mlas/lib/q4_dq.cpp
index 015d69de68766..c543770ee22d8 100644
--- a/onnxruntime/core/mlas/lib/q4_dq.cpp
+++ b/onnxruntime/core/mlas/lib/q4_dq.cpp
@@ -328,7 +328,7 @@ struct BitsTraits {
static constexpr float halfRange = static_cast<float>(kMid - kMin);
// number of qbit elements to pack into whole bytes
- static constexpr int kPackSize = (qbits == 8) ? 1 : (qbits == 4) ? 2 : (qbits == 2) ? 4 : 0;
+ static constexpr int kPackSize = (qbits == 8) ? 1 : ((qbits == 4) ? 2 : ((qbits == 2) ? 4 : 0));
static_assert(kPackSize != 0, "Packing to whole bytes not supported for this qbits!");
};
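kPackSize is the number of qbits-wide values that fill one byte, and the static_assert pins it to the supported widths. A quick check of the mapping, assuming the single-parameter BitsTraits<qbits> used above:

    static_assert(BitsTraits<2>::kPackSize == 4, "four 2-bit values per byte");
    static_assert(BitsTraits<4>::kPackSize == 2, "two 4-bit values per byte");
    static_assert(BitsTraits<8>::kPackSize == 1, "one 8-bit value per byte");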
@@ -387,12 +387,14 @@ range2scale(float min, float max, ScaleT& scale)
/**
- * @brief Blockwise quantization methods
+ * TODO(fajin): use int4/8 for symmetric quantization so the (vq - zp) operation in MatMulNBits can be saved.
+ * @brief Blockwise quantization methods. Source is row major. Dest, scale and zp are column major.
+ * Always quantize to unsigned int.
* @tparam ElementT source data type, e.g. fp32/fp16
* @tparam block_size number of elements quantized together
* @tparam qbits number of bits in each quantized element
- * @tparam Columnwise true: elements in a block come from one single column
- * false: elements in a block come from one single row
+ * @tparam Columnwise true: quantize along src column, pack along src column.
+ * false: quantize along src row, pack along src column.
*/
template <
typename ElementT,
@@ -402,11 +404,18 @@ template <
struct BlockwiseQuantizer {
// To support other qbits, need to add bit packing code for
// storing to dst and zero points
- static_assert(qbits == 4, "Only 4b block quantization is supported!");
+ static_assert(qbits == 2 || qbits == 4 || qbits == 8, "Only 2b, 4b and 8b block quantization is supported!");
using QuantBlk = std::conditional_t<Columnwise, Shape2D<block_size, 1>, Shape2D<1, block_size>>;
using ThreadBlk = Shape2D<QuantBlk::kRow * BitsTraits<qbits>::kPackSize, QuantBlk::kColumn>;
+ static
+ MLAS_FORCEINLINE
+ int GetElem(int val, int idx)
+ {
+ return (val >> (qbits * idx)) & ((1 << qbits) - 1);
+ }
+
static
MLAS_FORCEINLINE
void quantizeMetaShape(int rows, int columns, int& meta_rows, int& meta_cols)
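GetElem, added above, shifts the packed word right by idx slots of qbits bits each and masks off a single value. A standalone mirror of the logic for qbits == 2 (names here are illustrative):

    #include <cassert>

    int get_elem(int val, int idx, int qbits) {  // mirrors BlockwiseQuantizer::GetElem
      return (val >> (qbits * idx)) & ((1 << qbits) - 1);
    }

    int main() {
      const int packed = 0b10010011;  // 2-bit values {3, 0, 1, 2}, index 0 in the low bits
      assert(get_elem(packed, 0, 2) == 3);
      assert(get_elem(packed, 3, 2) == 2);
      return 0;
    }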
@@ -440,14 +449,14 @@ struct BlockwiseQuantizer {
scale_num_elements = meta_rows * meta_cols;
if (zero_point_bytes) {
- // this works for qbits == 4 but may need to be updated for other qbits values
+ // this works for qbits == 2, 4 or 8 but may need to be updated for other qbits values
*zero_point_bytes = ((meta_rows * qbits + 7) / 8) * meta_cols;
}
}
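The generalized expression bit-packs one column of zero points and rounds up to whole bytes. As a worked check: qbits == 2 with meta_rows == 5 needs ceil(5 * 2 / 8) == 2 bytes per column, and for qbits == 4 the expression reduces to the old ((meta_rows + 1) / 2) * meta_cols:

    static_assert((5 * 2 + 7) / 8 == 2, "five 2-bit zero points pack into 2 bytes");
    static_assert((5 * 4 + 7) / 8 == (5 + 1) / 2, "qbits == 4 matches the old formula");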
/**
* @brief Quantize a matrix of shape [rows, columns]; the resulting quantized
- * and packed data are stored in column major (transposed)
+ * and packed data are stored in column major (transposed).
* @param[out] dst pointer to the quantized weights, column major: [columns, rows]
* @param[out] scale pointer to the scales, column major: [columns/QuantBlk::kColumn, rows/QuantBlk::kRow]
* @param[out] zero_points pointer to the zero points, same shape as scale
@@ -479,8 +488,10 @@ struct BlockwiseQuantizer {
MlasTryBatchParallel(
thread_pool, total_thrd_blks,
[&](ptrdiff_t block_idx) {
- uint8_t zp_bytes[BitsTraits<qbits>::kPackSize];
- std::fill_n(zp_bytes, BitsTraits<qbits>::kPackSize, (uint8_t)8);
+ constexpr int kPackSize = BitsTraits<qbits>::kPackSize;
+ uint8_t zp_bytes[kPackSize], vi[kPackSize];
+ std::fill_n(zp_bytes, kPackSize, (uint8_t)BitsTraits<qbits>::kMid);
+ std::fill_n(vi, kPackSize, 0);
const int32_t r_blk_idx = static_cast<int32_t>(block_idx / thrd_col_blks);
const int32_t c_blk_idx = static_cast<int32_t>(block_idx % thrd_col_blks);
@@ -495,7 +506,7 @@ struct BlockwiseQuantizer {
const int meta_col = c / QuantBlk::kColumn;
// compute scale and zero point
- for (int kpack = 0; kpack < BitsTraits<qbits>::kPackSize; kpack++) {
+ for (int kpack = 0; kpack < kPackSize; kpack++) {
// scan a single block to extract range [min, max]
float min = std::numeric_limits<float>::max();
@@ -521,40 +532,42 @@ struct BlockwiseQuantizer {
}
}
- // !! 4b specific code as we need to pack 2 4b numbers into one byte
if (zero_points != nullptr) {
- const int32_t meta_idx = meta_col * ((row_blks + 1) / 2) + meta_row / 2;
- zero_points[meta_idx] = (zp_bytes[0] & 0xf) | (zp_bytes[1] << 4);
+ const int32_t meta_idx = meta_col * ((row_blks + kPackSize - 1) / kPackSize) + meta_row / kPackSize;
+ if constexpr (qbits == 8) {
+ zero_points[meta_idx] = zp_bytes[0];
+ } else if constexpr (qbits == 4) {
+ zero_points[meta_idx] = (zp_bytes[0] & 0xf) | (zp_bytes[1] << 4);
+ } else if constexpr (qbits == 2) {
+ zero_points[meta_idx] = (zp_bytes[0] & 0x3) | (zp_bytes[1] << 2) | (zp_bytes[2] << 4) | (zp_bytes[3] << 6);
+ } else {
+ MLAS_THROW_EX(std::runtime_error, "Unsupported qbits");
+ }
}
for (int32_t j = c; j < c_end; ++j) {
const int32_t meta_c = j / QuantBlk::kColumn;
- for (int32_t i = r; i < r_end; i += 2) {
- const int32_t meta_r = i / QuantBlk::kRow;
- const float scale = static_cast<float>(scales[meta_c * row_blks + meta_r]);
- const float reciprocal_scale = scale ? 1.0f / scale : 0.0f;
- const int8_t zp = zp_bytes[meta_r & 1];
- const int8_t zp1 = zp_bytes[((i + 1) / QuantBlk::kRow) & 1];
-
- const float v0 = static_cast<float>(src[i * leadingDimension + j]);
- const uint8_t vi0 = (uint8_t)std::clamp(roundf(v0 * reciprocal_scale + zp),
- 0.0f, BitsTraits<qbits>::kMaxFp);
-
- uint8_t vi1 = (uint8_t)zp;
- if (i + 1 < r_end) {
- float reciprocal_scale1 = reciprocal_scale;
- if constexpr (QuantBlk::kRow == 1) {
- const float scale1 =
- static_cast<float>(scales[meta_c * row_blks + meta_r + 1]);
- reciprocal_scale1 = scale1 ? 1.0f / scale1 : 0.0f;
- }
- const float v1 = static_cast<float>(src[(i + 1) * leadingDimension + j]);
- vi1 = (uint8_t)std::clamp(roundf(v1 * reciprocal_scale1 + zp1), 0.0f,
- BitsTraits<qbits>::kMaxFp);
+ for (int32_t i = r; i < r_end; i += kPackSize) {
+ for (int l = 0; l < kPackSize && i + l < r_end; l++) {
+ const int32_t meta_r = (i + l) / QuantBlk::kRow;
+ const float scale = static_cast<float>(scales[meta_c * row_blks + meta_r]);
+ const float reciprocal_scale = scale ? 1.0f / scale : 0.0f;
+ const int32_t zp = zp_bytes[meta_r % kPackSize];
+
+ const float v = static_cast<float>(src[(i + l) * leadingDimension + j]);
+ vi[l] = (uint8_t)std::clamp(roundf(v * reciprocal_scale + zp),
+ 0.0f, BitsTraits<qbits>::kMaxFp);
}
- // !! 4b specific code
- dst[j * q_rows + i / 2] = (vi0 & 0xf) | (vi1 << 4);
+ if constexpr (qbits == 8) {
+ dst[j * q_rows + i / kPackSize] = vi[0];
+ } else if constexpr (qbits == 4) {
+ dst[j * q_rows + i / kPackSize] = (vi[0] & 0xf) | (vi[1] << 4);
+ } else if constexpr (qbits == 2) {
+ dst[j * q_rows + i / kPackSize] = (vi[0] & 0x3) | (vi[1] << 2) | (vi[2] << 4) | (vi[3] << 6);
+ } else {
+ MLAS_THROW_EX(std::runtime_error, "Unsupported qbits");
+ }
}
}
});
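The qbits-specific branches above write kPackSize quantized values into each output byte, index 0 in the least-significant bits. A self-contained roundtrip for the qbits == 2 case (this mirrors, rather than calls, the patched code):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint8_t v[4] = {3, 0, 1, 2};
      // Pack as the qbits == 2 branch does.
      const uint8_t byte = (v[0] & 0x3) | (v[1] << 2) | (v[2] << 4) | (v[3] << 6);
      // Unpack as GetElem does and verify the roundtrip.
      for (int idx = 0; idx < 4; ++idx) {
        assert(((byte >> (2 * idx)) & 0x3) == v[idx]);
      }
      return 0;
    }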
@@ -589,6 +602,7 @@ struct BlockwiseQuantizer {
int q_rows, q_cols;
quantizedShape(rows, columns, q_rows, q_cols);
+ constexpr int32_t kPackSize = BitsTraits<qbits>::kPackSize;
MlasTryBatchParallel(
thread_pool, total_thrd_blks,
@@ -605,38 +619,22 @@ struct BlockwiseQuantizer {
for (int32_t j = c; j < c_end; ++j) {
const int32_t meta_col = j / QuantBlk::kColumn;
- // !! 4b specific code
- // the whole loop is 4b specific due to sub 8 bit packing
- // and unpacking. We can potentially make this qbits generic
- // by wraping the packing/unpacking code like cutlass::Array
- for (int32_t i = r; i < r_end; i += 2) {
+ for (int32_t i = r; i < r_end; ++i) {
const int32_t meta_row = i / QuantBlk::kRow;
-
- const float scale0 =
- static_cast<float>(scales[meta_col * row_blks + meta_row]);
-
+ const float scale = static_cast<float>(scales[meta_col * row_blks + meta_row]);
const int zp_pair =
- (zero_points == nullptr)
- ? 0x88
- : zero_points[meta_col * ((row_blks + 1) / 2) + meta_row / 2];
- const int zp0 = (meta_row & 1) ? (zp_pair >> 4) : (zp_pair & 0xf);
-
- const uint8_t vi0 = weights[j * q_rows + i / 2] & 0xf;
- const float v0 = (static_cast<float>(vi0) - zp0) * scale0;
-
- dst[j * rows + i] = static_cast<ElementT>(v0);
- if ((i + 1) < r_end) {
- float scale1 = scale0;
- int zp1 = zp0;
- if constexpr (QuantBlk::kRow == 1) {
- scale1 =
- static_cast<float>(scales[meta_col * row_blks + meta_row + 1]);
- zp1 = (zp_pair >> 4) & 0xf;
- }
- const uint8_t vi1 = weights[j * q_rows + i / 2] >> 4;
- const float v1 = (static_cast<float>(vi1) - zp1) * scale1;
- dst[j * rows + (i + 1)] = static_cast<ElementT>(v1);
- }
+ zero_points
+ ? zero_points[meta_col * ((row_blks + kPackSize - 1) / kPackSize) + meta_row / kPackSize]
+ : 0;
+ const int vi_pair = weights[j * q_rows + i / kPackSize];
+
+ const int zp =
+ zero_points
+ ? GetElem(zp_pair, meta_row % kPackSize)
+ : BitsTraits<qbits>::kMid;
+ const int vi = GetElem(vi_pair, i % kPackSize);
+ const float v = (vi - zp) * scale;
+ dst[j * rows + i] = ElementT(v);
}
}
});
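The rewritten inner loop applies v = (vi - zp) * scale one element at a time, using BitsTraits<qbits>::kMid as the implicit zero point when none is stored. A worked qbits == 2 example:

    // vi = 3, no stored zero point so zp = kMid = 2, scale = 0.5
    const float v = (3 - 2) * 0.5f;  // dequantizes to 0.5f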
@@ -1416,6 +1414,27 @@ MlasBlockwiseQuantizedShape(
}
}
+template
+void
+MlasBlockwiseQuantMetaShape<float, 2>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int& meta_rows,
+ int& meta_cols
+ );
+
+template
+void
+MlasBlockwiseQuantMetaShape<MLAS_FP16, 2>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int& meta_rows,
+ int& meta_cols
+ );
template
void
@@ -1439,6 +1458,50 @@ MlasBlockwiseQuantMetaShape(
int& meta_cols
);
+template
+void
+MlasBlockwiseQuantMetaShape<float, 8>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int& meta_rows,
+ int& meta_cols
+ );
+
+template
+void
+MlasBlockwiseQuantMetaShape<MLAS_FP16, 8>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int& meta_rows,
+ int& meta_cols
+ );
+
+template
+void
+MlasBlockwiseQuantizedShape<float, 2>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int& q_rows,
+ int& q_cols
+ );
+
+template
+void
+MlasBlockwiseQuantizedShape<MLAS_FP16, 2>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int& q_rows,
+ int& q_cols
+ );
+
template
void
MlasBlockwiseQuantizedShape<float, 4>(
@@ -1461,9 +1524,31 @@ MlasBlockwiseQuantizedShape(
int& q_cols
);
+template
+void
+MlasBlockwiseQuantizedShape<float, 8>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int& q_rows,
+ int& q_cols
+ );
+
+template
+void
+MlasBlockwiseQuantizedShape<MLAS_FP16, 8>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int& q_rows,
+ int& q_cols
+ );
+
+template <int qbits>
void MLASCALL
MlasBlockwiseQuantizedBufferSizes(
- int qbits,
int block_size,
bool columnwise,
int rows,
@@ -1478,75 +1563,108 @@ MlasBlockwiseQuantizedBufferSizes(
*q_zero_point_size_in_bytes = 0;
}
- if (qbits == 4) {
- switch (block_size) {
- case 16:
- if (columnwise) {
- BlockwiseQuantizer<float, 16, 4, true>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- } else {
- BlockwiseQuantizer<float, 16, 4, false>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- }
- break;
-
- case 32:
- if (columnwise) {
- BlockwiseQuantizer<float, 32, 4, true>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- } else {
- BlockwiseQuantizer<float, 32, 4, false>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- }
- break;
-
- case 64:
- if (columnwise) {
- BlockwiseQuantizer<float, 64, 4, true>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- } else {
- BlockwiseQuantizer<float, 64, 4, false>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- }
- break;
-
- case 128:
- if (columnwise) {
- BlockwiseQuantizer<float, 128, 4, true>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- } else {
- BlockwiseQuantizer<float, 128, 4, false>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- }
- break;
-
- case 256:
- if (columnwise) {
- BlockwiseQuantizer<float, 256, 4, true>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- } else {
- BlockwiseQuantizer<float, 256, 4, false>::quantizedBufferSizes(
- rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
- );
- }
- break;
+ switch (block_size) {
+ case 16:
+ if (columnwise) {
+ BlockwiseQuantizer<float, 16, qbits, true>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ } else {
+ BlockwiseQuantizer<float, 16, qbits, false>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ }
+ break;
- default:
- // Only block size 16, 32, 64, 128, 256 are supported.
- break;
- }
+ case 32:
+ if (columnwise) {
+ BlockwiseQuantizer<float, 32, qbits, true>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ } else {
+ BlockwiseQuantizer<float, 32, qbits, false>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ }
+ break;
+
+ case 64:
+ if (columnwise) {
+ BlockwiseQuantizer<float, 64, qbits, true>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ } else {
+ BlockwiseQuantizer<float, 64, qbits, false>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ }
+ break;
+
+ case 128:
+ if (columnwise) {
+ BlockwiseQuantizer<float, 128, qbits, true>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ } else {
+ BlockwiseQuantizer<float, 128, qbits, false>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ }
+ break;
+
+ case 256:
+ if (columnwise) {
+ BlockwiseQuantizer<float, 256, qbits, true>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ } else {
+ BlockwiseQuantizer<float, 256, qbits, false>::quantizedBufferSizes(
+ rows, columns, q_data_size_in_bytes, q_scale_num_elements, q_zero_point_size_in_bytes
+ );
+ }
+ break;
+
+ default:
+ // Only block size 16, 32, 64, 128, 256 are supported.
+ break;
}
}
+template
+void MLASCALL
+MlasBlockwiseQuantizedBufferSizes<2>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ size_t& q_data_size_in_bytes,
+ size_t& q_scale_num_elements,
+ size_t* q_zero_point_size_in_bytes
+);
+
+template
+void MLASCALL
+MlasBlockwiseQuantizedBufferSizes<4>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ size_t& q_data_size_in_bytes,
+ size_t& q_scale_num_elements,
+ size_t* q_zero_point_size_in_bytes
+);
+
+template
+void MLASCALL
+MlasBlockwiseQuantizedBufferSizes<8>(
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ size_t& q_data_size_in_bytes,
+ size_t& q_scale_num_elements,
+ size_t* q_zero_point_size_in_bytes
+);
template <typename T, int qbits>
void
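For callers, the bit width moves from a runtime argument to a template argument, matching the explicit instantiations above. A migration sketch (buffer dimensions are illustrative):

    size_t data_bytes = 0, scale_elems = 0, zp_bytes = 0;
    // Before: MlasBlockwiseQuantizedBufferSizes(4, 32, true, rows, cols, ...);
    MlasBlockwiseQuantizedBufferSizes<4>(/*block_size=*/32, /*columnwise=*/true,
                                         /*rows=*/1024, /*columns=*/4096,
                                         data_bytes, scale_elems, &zp_bytes);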
@@ -1620,6 +1738,36 @@ MlasQuantizeBlockwise(
}
}
+template
+void
+MlasQuantizeBlockwise<float, 2>(
+ uint8_t* dst,
+ float* scales,
+ uint8_t* zero_points,
+ const float* src,
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int leading_dimension,
+ MLAS_THREADPOOL* thread_pool
+ );
+
+template
+void
+MlasQuantizeBlockwise<MLAS_FP16, 2>(
+ uint8_t* dst,
+ MLAS_FP16* scales,
+ uint8_t* zero_points,
+ const MLAS_FP16* src,
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int leading_dimension,
+ MLAS_THREADPOOL* thread_pool
+ );
+
template
void
MlasQuantizeBlockwise<float, 4>(
@@ -1650,6 +1798,35 @@ MlasQuantizeBlockwise(
MLAS_THREADPOOL* thread_pool
);
+template
+void
+MlasQuantizeBlockwise<float, 8>(
+ uint8_t* dst,
+ float* scales,
+ uint8_t* zero_points,
+ const float* src,
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int leading_dimension,
+ MLAS_THREADPOOL* thread_pool
+ );
+
+template
+void
+MlasQuantizeBlockwise<MLAS_FP16, 8>(
+ uint8_t* dst,
+ MLAS_FP16* scales,
+ uint8_t* zero_points,
+ const MLAS_FP16* src,
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ int leading_dimension,
+ MLAS_THREADPOOL* thread_pool
+ );
template <typename T, int qbits>
void
@@ -1717,6 +1894,32 @@ MlasDequantizeBlockwise(
}
}
+template void
+MlasDequantizeBlockwise<float, 2>(
+ float* dst,
+ const uint8_t* src,
+ const float* scales,
+ const uint8_t* zero_points,
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ MLAS_THREADPOOL* thread_pool
+);
+
+template void
+MlasDequantizeBlockwise<MLAS_FP16, 2>(
+ MLAS_FP16* dst,
+ const uint8_t* src,
+ const MLAS_FP16* scales,
+ const uint8_t* zero_points,
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ MLAS_THREADPOOL* thread_pool
+);
+
template void
MlasDequantizeBlockwise<float, 4>(
float* dst,
@@ -1730,6 +1933,45 @@ MlasDequantizeBlockwise(
MLAS_THREADPOOL* thread_pool
);
+template void
+MlasDequantizeBlockwise<MLAS_FP16, 4>(
+ MLAS_FP16* dst,
+ const uint8_t* src,
+ const MLAS_FP16* scales,
+ const uint8_t* zero_points,
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ MLAS_THREADPOOL* thread_pool
+);
+
+template void
+MlasDequantizeBlockwise<float, 8>(
+ float* dst,
+ const uint8_t* src,
+ const float* scales,
+ const uint8_t* zero_points,
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ MLAS_THREADPOOL* thread_pool
+);
+
+template void
+MlasDequantizeBlockwise<MLAS_FP16, 8>(
+ MLAS_FP16* dst,
+ const uint8_t* src,
+ const MLAS_FP16* scales,
+ const uint8_t* zero_points,
+ int block_size,
+ bool columnwise,
+ int rows,
+ int columns,
+ MLAS_THREADPOOL* thread_pool
+);
+
template <typename Tin, int qbits>
bool
MlasQDQQuantizeBlockwise(
diff --git a/onnxruntime/core/mlas/lib/transpose.cpp b/onnxruntime/core/mlas/lib/transpose.cpp
index a758a0e59fb4f..1ee2f90357e9e 100644
--- a/onnxruntime/core/mlas/lib/transpose.cpp
+++ b/onnxruntime/core/mlas/lib/transpose.cpp
@@ -16,6 +16,20 @@ Module Name:
#include "mlasi.h"
+//
+// Define the parameters to execute segments of a transpose operation on worker
+// threads.
+//
+
+template <typename ElementType>
+struct MLAS_TRANPOSE_WORK_BLOCK {
+ ptrdiff_t ThreadCountM;
+ const ElementType* Input;
+ ElementType* Output;
+ size_t M;
+ size_t N;
+};
+
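A sketch of the intended dispatch around this work block: fill one block, size the thread count, and let each worker claim a contiguous row range via MlasPartitionWork. The helper names follow existing MLAS conventions, but the exact wiring is an assumption, not this patch's code:

    template <typename ElementType>
    void DispatchTranspose(const ElementType* Input, ElementType* Output,
                           size_t M, size_t N, MLAS_THREADPOOL* ThreadPool) {
      MLAS_TRANPOSE_WORK_BLOCK<ElementType> WorkBlock;
      WorkBlock.ThreadCountM = MlasGetMaximumThreadCount(ThreadPool);  // assumed helper
      WorkBlock.Input = Input;
      WorkBlock.Output = Output;
      WorkBlock.M = M;
      WorkBlock.N = N;
      MlasExecuteThreaded(MlasTransposeThreaded<ElementType>, &WorkBlock,
                          WorkBlock.ThreadCountM, ThreadPool);
    }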
#if defined(MLAS_SSE2_INTRINSICS)
MLAS_FORCEINLINE
@@ -541,51 +555,69 @@ MlasTranspose8xNVector(
MlasTranspose4xNVector(&Input[InputStride * 4], InputStride, &Output[OutputStride * 4], OutputStride);
}
+template <typename ElementType>
void
-MLASCALL
-MlasTranspose(
- const uint32_t* Input,
- uint32_t* Output,
- size_t M,
- size_t N
- )
+MlasTransposeThreaded(
+ void* Context,
+ ptrdiff_t ThreadId
+);
/*++
Routine Description:
- This routine transposes the input matrix (M rows by N columns) to the
- output matrix (N rows by M columns).
+ This routine is invoked from a worker thread to execute a segment of a transpose operation.
Arguments:
- Input - Supplies the input buffer.
-
- Output - Supplies the output buffer.
-
- M - Supplies the number of rows for the input matrix and the number of
- columns for the output matrix.
+ Context - Supplies the pointer to the context for the threaded operation.
- N - Supplies the number of columns for the input matrix and the number of
- rows for the output matrix.
+ ThreadId - Supplies the current index of the threaded operation.
Return Value:
None.
--*/
+
+template<>
+void
+MlasTransposeThreaded<uint32_t>(
+ void* Context,
+ ptrdiff_t ThreadId
+ )
{
- size_t n = N;
+ const auto* WorkBlock = (MLAS_TRANPOSE_WORK_BLOCK<uint32_t>*)Context;
+
+ //
+ // Partition the operation along the M dimension.
+ //
+
+ size_t IndexM;
+ size_t CountM;
+ MlasPartitionWork(ThreadId, WorkBlock->ThreadCountM, WorkBlock->M, &IndexM, &CountM);
+
+ //
+ // Set transpose parameters.
+ //
+
+ const size_t M = WorkBlock->M;
+ const size_t N = WorkBlock->N;
+
+ const uint32_t* Input = WorkBlock->Input + IndexM * N;
+ uint32_t* Output = WorkBlock->Output + IndexM;
//
// Transpose elements from the input matrix to the output matrix 4 columns
// at a time.
//
+ size_t n = N;
+
while (n >= 4) {
const uint32_t* s = Input;
uint32_t* d = Output;
- size_t m = M;
+ size_t m = CountM;
#if defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_NEON_INTRINSICS) || defined(MLAS_TARGET_POWER) || \
defined(MLAS_LSX_INTRINSICS)
@@ -624,7 +656,7 @@ Return Value:
const uint32_t* s = Input;
uint32_t* d = Output;
- size_t m = M;
+ size_t m = CountM;
while (m >= 4) {
@@ -650,68 +682,45 @@ Return Value:
}
}
+template<>
void
-MLASCALL
-MlasTranspose(
- const float* Input,
- float* Output,
- size_t M,
- size_t N
+MlasTransposeThreaded<float>(
+ void* Context,
+ ptrdiff_t ThreadId
)
{
- MlasTranspose(
- reinterpret_cast<const uint32_t*>(Input),
- reinterpret_cast