From d062cf131d7ed8729b33d8eb40f4ff814f429ded Mon Sep 17 00:00:00 2001 From: "Klimenko, Mikhail" Date: Fri, 11 Jul 2025 15:57:47 +0200 Subject: [PATCH 1/6] Add on-the-fly bfloat16->float16 conversion pass --- .../providers/openvino/backend_manager.cc | 13 ++++- .../core/providers/openvino/contexts.h | 3 +- .../openvino/openvino_execution_provider.cc | 3 +- .../openvino/openvino_provider_factory.cc | 2 + .../openvino/ov_versions/capability.cc | 17 +++--- .../openvino/ov_versions/capability.h | 3 +- .../openvino/ov_versions/data_ops.cc | 3 + .../providers/openvino/ov_versions/data_ops.h | 7 ++- .../qdq_transformations/qdq_scales_fix.cpp | 55 ++++++++++++++++++- .../qdq_transformations/qdq_scales_fix.h | 5 ++ .../core/session/provider_bridge_ort.cc | 1 + .../python/onnxruntime_pybind_state.cc | 3 +- onnxruntime/test/perftest/ort_test_session.cc | 7 +++ 13 files changed, 104 insertions(+), 18 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 65532c31e14bd..567d05427a0df 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -453,7 +453,18 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } else { + } + else if (session_context_.enable_bfloat16_optimizer) { + std::unique_ptr model; + Status status = bfloat16_fix::Transform(subgraph, logger, model); + auto model_proto = model->ToProto(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + print_model_proto_duration(); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); + ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); + return model_proto; + } + else { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled"; auto model = subgraph.CreateModel(logger); auto model_proto = model->ToProto(); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 6a2b375d733f9..1ed89253f27ac 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -101,6 +101,7 @@ struct ProviderInfo { bool disable_dynamic_shapes{false}; // [disable_dynamic_shapes]: Rewrite dynamic shaped models to // static shape at runtime and execute. bool enable_qdq_optimizer{false}; // Enables QDQ pruning for efficient inference latency with NPU + bool enable_bfloat16_optimizer{false}; // Enables on-the-fly bfloat16->float16 conversion bool enable_causallm{false}; // Enables Causal LM Compilation for ORT GenAI OVEP Pass bool so_context_enable{false}; // ORT session option bool so_disable_cpu_ep_fallback{false}; // ORT session option @@ -110,7 +111,7 @@ struct ProviderInfo { const ConfigOptions* config_options{NULL}; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", - "enable_causallm", "disable_dynamic_shapes", "reshape_input"}; + "enable_bfloat16_optimizer", "enable_causallm", "disable_dynamic_shapes", "reshape_input"}; }; // Holds context applicable to the entire EP instance. 
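Usage sketch (illustrative, not part of the diffs): a caller could opt in to the new enable_bfloat16_optimizer key registered in contexts.h above through ORT's C++ API as shown below. AppendExecutionProvider_OpenVINO_V2 and the string-to-string options map are the same API the unit test added in patch 4/6 uses; the header path and device choice are assumptions. Note that patch 3/6 later removes this key and triggers the conversion automatically whenever bfloat16 tensors are detected.

#include <string>
#include <unordered_map>

#include "onnxruntime_cxx_api.h"  // "core/session/onnxruntime_cxx_api.h" inside the ORT source tree

int main() {
  Ort::SessionOptions session_options;

  // Keys must be listed in valid_provider_keys (contexts.h); all values are strings.
  std::unordered_map<std::string, std::string> ov_options;
  ov_options["device_type"] = "NPU";
  ov_options["enable_bfloat16_optimizer"] = "true";  // opt in to the bfloat16 -> float16 pass

  // Attach the OpenVINO EP with the options above.
  session_options.AppendExecutionProvider_OpenVINO_V2(ov_options);

  // ... create an Ort::Env and Ort::Session with session_options and a bfloat16 model ...
  return 0;
}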
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index a0aa04293ac37..825a79520a5e1 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -81,7 +81,8 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, openvino_ep::GetCapability obj(ep_ctx_handle_, graph_viewer, session_context_.device_type, - session_context_.enable_qdq_optimizer); + session_context_.enable_qdq_optimizer, + session_context_.enable_bfloat16_optimizer); result = obj.Execute(); session_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); session_context_.has_external_weights = obj.HasExternalWeights(); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index bad1d416eeda2..67dc1f1c56527 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -340,6 +340,8 @@ static void ParseProviderInfo(const ProviderOptions& provider_options, pi.enable_qdq_optimizer = ParseBooleanOption(provider_options, "enable_qdq_optimizer"); + pi.enable_bfloat16_optimizer = ParseBooleanOption(provider_options, "enable_bfloat16_optimizer"); + pi.enable_causallm = ParseBooleanOption(provider_options, "enable_causallm"); pi.disable_dynamic_shapes = ParseBooleanOption(provider_options, "disable_dynamic_shapes"); diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 88ddde8610c6e..4d844f233297f 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -30,9 +30,10 @@ namespace openvino_ep { GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler, const GraphViewer& graph_viewer_param, const std::string device_type_param, - const bool enable_qdq_optimizer) : ep_ctx_handler_(ep_ctx_handler), - graph_viewer_(graph_viewer_param), - device_type_(std::move(device_type_param)) { + const bool enable_qdq_optimizer, + bool enable_bfloat16_optimizer) : ep_ctx_handler_(ep_ctx_handler), + graph_viewer_(graph_viewer_param), + device_type_(std::move(device_type_param)) { bool npu_qdq_optimizer_enabled = false; if (device_type_.find("NPU") != std::string::npos) { device_type_ = "CPU"; @@ -42,15 +43,15 @@ GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler, } #if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5 - data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); #elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 6 - data_ops_ = new DataOps(graph_viewer_, V_2024_6, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2024_6, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); #elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0 - data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); #elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 1 - 
data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); #else - data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); #endif } diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.h b/onnxruntime/core/providers/openvino/ov_versions/capability.h index 364e79a76f154..786768d0ba3c4 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.h +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.h @@ -24,7 +24,8 @@ class GetCapability { GetCapability(const EPCtxHandler& ep_ctx_handler, const GraphViewer& graph_viewer_param, const std::string device_type_param, - const bool enable_qdq_optimizer); + const bool enable_qdq_optimizer, + bool enable_bfloat16_optimizer); virtual std::vector> Execute(); bool IsWhollySupportedGraph() { return is_wholly_supported_graph_; diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 84001c1161efc..868b968fde7a1 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -617,6 +617,9 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { // experimentally for GPU and qdq stripping mode allow int16 types if (npu_qdq_optimizer_enabled_ && (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)) return true; + // Enable bfloat16 -> float16 on-the-fly conversion + if (bfloat16_optimizer_enabled_ && dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16) + return true; } #ifndef NDEBUG if (openvino_ep::backend_utils::IsDebugEnabled()) { diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index cf7d834d6cfc7..4ee3d9736d29c 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -69,6 +69,7 @@ class DataOps { std::set supported_types_gpu_; std::set supported_types_initializer_; bool npu_qdq_optimizer_enabled_; + bool bfloat16_optimizer_enabled_; protected: void populate_op_mode_supported(); @@ -81,11 +82,13 @@ class DataOps { public: DataOps(const GraphViewer& graph_viewer_param, VersionNum ver, - const std::string dev_id, const bool npu_qdq_optimizer_enabled) + const std::string dev_id, const bool npu_qdq_optimizer_enabled, + bool bfloat16_optimizer_enabled) : graph_viewer_(graph_viewer_param), version_id_(ver), device_id_(std::move(dev_id)), - npu_qdq_optimizer_enabled_(npu_qdq_optimizer_enabled) { + npu_qdq_optimizer_enabled_(npu_qdq_optimizer_enabled), + bfloat16_optimizer_enabled_(bfloat16_optimizer_enabled) { populate_op_mode_supported(); populate_types_supported(); } diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index 571aa57c99f33..3a11168920c3f 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -3,6 +3,7 
@@ #include "qdq_scales_fix.h" #include "core/providers/openvino/ov_protobuf_utils.h" +#include "core/framework/float16.h" #include #include @@ -605,8 +606,7 @@ float get_initializer_value(const Graph& graph, const std::string& initializer_n auto size = get_initializer_size(graph, initializer_name); ORT_ENFORCE(size == 1, "Expected an initializer to be of size 1"); return raw_data[0]; - } - else + } else return get_float_initializer_data(p_initializer); } @@ -775,7 +775,6 @@ bool scale_graph(CustomGraph& gen_graph, return needs_second_run; } - Status copy_model(const GraphViewer& src_graph_viewer, const logging::Logger& logger, std::unique_ptr& model) { model = src_graph_viewer.CreateModel(logger); @@ -942,5 +941,55 @@ Status Transform(const GraphViewer& src_graph_viewer, return status; } } // namespace qdq_scales_fix + +namespace bfloat16_fix { +void replace_bf16_with_fp16(qdq_scales_fix::CustomGraph& gen_graph) { + for (auto& const_node : gen_graph.original_graph.Nodes()) { + auto node = const_cast(const_node); + if (node->OpType() == "Cast") { + for (auto& [name, const_attribute] : node->GetAttributes()) { + auto& attribute = const_cast(const_attribute); + if (name == "to" && attribute.type() == ONNX_NAMESPACE::AttributeProto_AttributeType_INT) + if (attribute.i() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) + attribute.set_i(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + } + } + for (auto& output : node->OutputDefs()) { + auto& output_proto = const_cast(output->ToProto().type()); + if (output_proto.mutable_tensor_type()->elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) + output_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + } + } + + const auto& init_set = gen_graph.original_graph.GetAllInitializedTensors(); + for (auto& [key, const_tensor_proto] : init_set) { + auto tensor_proto = const_cast(const_tensor_proto); + auto dt = tensor_proto->data_type(); + if (dt == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) { + auto raw_data = tensor_proto->has_raw_data() ? 
reinterpret_cast(tensor_proto->mutable_raw_data()->data()) : nullptr; + if (raw_data) { + tensor_proto->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + std::int64_t size = 1; + for (int i = 0; i < tensor_proto->dims_size(); ++i) + size *= tensor_proto->dims()[i]; + for (std::int64_t i = 0; i < size; ++i) { + std::uint32_t tmp = static_cast(raw_data[i]) << 16; + raw_data[i] = onnxruntime::MLFloat16(*reinterpret_cast(&tmp)).val; + } + } + } + } +} + +Status Transform(const GraphViewer& src_graph_viewer, + const logging::Logger& logger, + /*out*/ std::unique_ptr& model) { + auto status = qdq_scales_fix::copy_model(src_graph_viewer, logger, model); + auto g = qdq_scales_fix::generate_graph_from_onnx(model->MainGraph()); + + replace_bf16_with_fp16(g); + return status; +} +} // namespace bfloat16_fix } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h index c54c531e1bd40..2182850d96c43 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h @@ -15,5 +15,10 @@ Status Transform(const GraphViewer& src_graph, const logging::Logger& logger, /*out*/ std::unique_ptr& model); } +namespace bfloat16_fix { +Status Transform(const GraphViewer& src_graph, + const logging::Logger& logger, + /*out*/ std::unique_ptr& model); +} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 3db35ae8769e0..3c3a1942a4cf1 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2144,6 +2144,7 @@ ProviderOptions OrtOpenVINOProviderOptionsToOrtOpenVINOProviderOptionsV2(const O ov_options_converted_map["load_config"] = ""; ov_options_converted_map["model_priority"] = "DEFAULT"; ov_options_converted_map["enable_qdq_optimizer"] = "false"; + ov_options_converted_map["enable_bfloat16_optimizer"] = "false"; ov_options_converted_map["enable_causallm"] = "false"; return ov_options_converted_map; } diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index bdc4f65e590d9..cad66e4fd3566 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1182,7 +1182,8 @@ static std::shared_ptr CreateExecutionProviderFactory #if defined(USE_OPENVINO) || defined(USE_OPENVINO_PROVIDER_INTERFACE) ProviderOptions OV_provider_options_map; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", - "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", + "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", + "enable_qdq_optimizer", "enable_bfloat16_optimizer", "enable_causallm", "disable_dynamic_shapes", "reshape_input"}; auto it = provider_options_map.find(type); if (it != provider_options_map.end()) { diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 7a210ca8482a4..5f2ea1cf2e2d7 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -764,6 +764,13 @@ select from 'TF8', 'TF16', 
'UINT8', 'FLOAT', 'ITENSOR'. \n)"); } else { ORT_THROW("[ERROR] [OpenVINO] The value for the key 'enable_qdq_optimizer' should be a boolean i.e. true or false. Default value is false.\n"); } + } else if (key == "enable_bfloat16_optimizer") { + if (value == "true" || value == "True" || + value == "false" || value == "False") { + ov_options[key] = value; + } else { + ORT_THROW("[ERROR] [OpenVINO] The value for the key 'enable_bfloat16_optimizer' should be a boolean i.e. true or false. Default value is false.\n"); + } } else if (key == "enable_causallm") { if (value == "true" || value == "True" || value == "false" || value == "False") { From 4a2c571859452dcea2f19d116a05ad541533c86f Mon Sep 17 00:00:00 2001 From: "Klimenko, Mikhail" Date: Mon, 14 Jul 2025 07:55:49 +0000 Subject: [PATCH 2/6] Fix undetected bfloat16 initializers --- onnxruntime/core/providers/openvino/backend_manager.cc | 1 + .../core/providers/openvino/ov_versions/data_ops.cc | 10 ++++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 567d05427a0df..8bc99a7e0907d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -455,6 +455,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, return model_proto; } else if (session_context_.enable_bfloat16_optimizer) { + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; std::unique_ptr model; Status status = bfloat16_fix::Transform(subgraph, logger, model); auto model_proto = model->ToProto(); diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 868b968fde7a1..a8db03170723d 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -560,8 +560,11 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { return false; } + auto dtype = type_proto->tensor_type().elem_type(); + // Enable bfloat16 -> float16 on-the-fly conversion + if (bfloat16_optimizer_enabled_ && dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16) + return true; if (is_initializer) { - auto dtype = type_proto->tensor_type().elem_type(); for (auto const& var : supported_types_initializer_) { if ((var.first <= version_id_) && (var.second == dtype)) { @@ -576,8 +579,6 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { #endif return false; } else { - auto dtype = type_proto->tensor_type().elem_type(); - if (device_id_.find("HETERO") != std::string::npos || device_id_.find("MULTI") != std::string::npos || device_id_.find("AUTO") != std::string::npos) { for (auto const& var : supported_types_npu_) { @@ -617,9 +618,6 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { // experimentally for GPU and qdq stripping mode allow int16 types if (npu_qdq_optimizer_enabled_ && (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)) return true; - // Enable bfloat16 -> float16 on-the-fly conversion - if (bfloat16_optimizer_enabled_ && dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16) - return true; } #ifndef NDEBUG if 
(openvino_ep::backend_utils::IsDebugEnabled()) { From 601dd30a23346dd3da855241900bf4560fbadc41 Mon Sep 17 00:00:00 2001 From: "Klimenko, Mikhail" Date: Mon, 14 Jul 2025 11:54:07 +0000 Subject: [PATCH 3/6] Remove the option and make the logic implicit --- .../core/providers/openvino/backend_manager.cc | 18 ++++++++++++++---- onnxruntime/core/providers/openvino/contexts.h | 3 +-- .../openvino/openvino_execution_provider.cc | 3 +-- .../openvino/openvino_provider_factory.cc | 2 -- .../openvino/ov_versions/capability.cc | 17 ++++++++--------- .../openvino/ov_versions/capability.h | 3 +-- .../providers/openvino/ov_versions/data_ops.cc | 2 +- .../providers/openvino/ov_versions/data_ops.h | 7 ++----- .../core/session/provider_bridge_ort.cc | 1 - onnxruntime/python/onnxruntime_pybind_state.cc | 3 +-- onnxruntime/test/perftest/ort_test_session.cc | 7 ------- 11 files changed, 29 insertions(+), 37 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 8bc99a7e0907d..ceec01353d3f6 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -372,6 +372,18 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) { return false; } +static bool HasBf16(const onnxruntime::GraphViewer& graph_viewer) { + const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); + for (std::size_t i = 0; i < node_indices.size(); i++) { + gsl::not_null node(graph_viewer.GetNode(node_indices[i])); + for (auto& output : node->OutputDefs()) { + if (output->ToProto().type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) + return true; + } + } + return false; +} + static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name, [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto, [[maybe_unused]] const onnxruntime::Node& fused_node) { @@ -453,8 +465,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } - else if (session_context_.enable_bfloat16_optimizer) { + } else if (HasBf16(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; std::unique_ptr model; Status status = bfloat16_fix::Transform(subgraph, logger, model); @@ -464,8 +475,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } - else { + } else { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled"; auto model = subgraph.CreateModel(logger); auto model_proto = model->ToProto(); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 1ed89253f27ac..6a2b375d733f9 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -101,7 +101,6 @@ struct ProviderInfo { bool disable_dynamic_shapes{false}; // [disable_dynamic_shapes]: Rewrite dynamic shaped models to // static shape at runtime and execute. 
bool enable_qdq_optimizer{false}; // Enables QDQ pruning for efficient inference latency with NPU - bool enable_bfloat16_optimizer{false}; // Enables on-the-fly bfloat16->float16 conversion bool enable_causallm{false}; // Enables Causal LM Compilation for ORT GenAI OVEP Pass bool so_context_enable{false}; // ORT session option bool so_disable_cpu_ep_fallback{false}; // ORT session option @@ -111,7 +110,7 @@ struct ProviderInfo { const ConfigOptions* config_options{NULL}; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", - "enable_bfloat16_optimizer", "enable_causallm", "disable_dynamic_shapes", "reshape_input"}; + "enable_causallm", "disable_dynamic_shapes", "reshape_input"}; }; // Holds context applicable to the entire EP instance. diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 825a79520a5e1..a0aa04293ac37 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -81,8 +81,7 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, openvino_ep::GetCapability obj(ep_ctx_handle_, graph_viewer, session_context_.device_type, - session_context_.enable_qdq_optimizer, - session_context_.enable_bfloat16_optimizer); + session_context_.enable_qdq_optimizer); result = obj.Execute(); session_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); session_context_.has_external_weights = obj.HasExternalWeights(); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 67dc1f1c56527..bad1d416eeda2 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -340,8 +340,6 @@ static void ParseProviderInfo(const ProviderOptions& provider_options, pi.enable_qdq_optimizer = ParseBooleanOption(provider_options, "enable_qdq_optimizer"); - pi.enable_bfloat16_optimizer = ParseBooleanOption(provider_options, "enable_bfloat16_optimizer"); - pi.enable_causallm = ParseBooleanOption(provider_options, "enable_causallm"); pi.disable_dynamic_shapes = ParseBooleanOption(provider_options, "disable_dynamic_shapes"); diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 4d844f233297f..88ddde8610c6e 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -30,10 +30,9 @@ namespace openvino_ep { GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler, const GraphViewer& graph_viewer_param, const std::string device_type_param, - const bool enable_qdq_optimizer, - bool enable_bfloat16_optimizer) : ep_ctx_handler_(ep_ctx_handler), - graph_viewer_(graph_viewer_param), - device_type_(std::move(device_type_param)) { + const bool enable_qdq_optimizer) : ep_ctx_handler_(ep_ctx_handler), + graph_viewer_(graph_viewer_param), + device_type_(std::move(device_type_param)) { bool npu_qdq_optimizer_enabled = false; if (device_type_.find("NPU") != std::string::npos) { device_type_ = "CPU"; @@ -43,15 +42,15 @@ GetCapability::GetCapability(const 
EPCtxHandler& ep_ctx_handler, } #if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5 - data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); + data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled); #elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 6 - data_ops_ = new DataOps(graph_viewer_, V_2024_6, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); + data_ops_ = new DataOps(graph_viewer_, V_2024_6, device_type_, npu_qdq_optimizer_enabled); #elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0 - data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); + data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled); #elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 1 - data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); + data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled); #else - data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled, enable_bfloat16_optimizer); + data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled); #endif } diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.h b/onnxruntime/core/providers/openvino/ov_versions/capability.h index 786768d0ba3c4..364e79a76f154 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.h +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.h @@ -24,8 +24,7 @@ class GetCapability { GetCapability(const EPCtxHandler& ep_ctx_handler, const GraphViewer& graph_viewer_param, const std::string device_type_param, - const bool enable_qdq_optimizer, - bool enable_bfloat16_optimizer); + const bool enable_qdq_optimizer); virtual std::vector> Execute(); bool IsWhollySupportedGraph() { return is_wholly_supported_graph_; diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index a8db03170723d..8f271a825711e 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -562,7 +562,7 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { auto dtype = type_proto->tensor_type().elem_type(); // Enable bfloat16 -> float16 on-the-fly conversion - if (bfloat16_optimizer_enabled_ && dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16) + if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16) return true; if (is_initializer) { for (auto const& var : supported_types_initializer_) { diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index 4ee3d9736d29c..cf7d834d6cfc7 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -69,7 +69,6 @@ class DataOps { std::set supported_types_gpu_; std::set supported_types_initializer_; bool npu_qdq_optimizer_enabled_; - bool bfloat16_optimizer_enabled_; protected: void populate_op_mode_supported(); @@ -82,13 +81,11 @@ class DataOps { public: DataOps(const GraphViewer& graph_viewer_param, VersionNum ver, - const std::string dev_id, const bool 
npu_qdq_optimizer_enabled, - bool bfloat16_optimizer_enabled) + const std::string dev_id, const bool npu_qdq_optimizer_enabled) : graph_viewer_(graph_viewer_param), version_id_(ver), device_id_(std::move(dev_id)), - npu_qdq_optimizer_enabled_(npu_qdq_optimizer_enabled), - bfloat16_optimizer_enabled_(bfloat16_optimizer_enabled) { + npu_qdq_optimizer_enabled_(npu_qdq_optimizer_enabled) { populate_op_mode_supported(); populate_types_supported(); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 3c3a1942a4cf1..3db35ae8769e0 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2144,7 +2144,6 @@ ProviderOptions OrtOpenVINOProviderOptionsToOrtOpenVINOProviderOptionsV2(const O ov_options_converted_map["load_config"] = ""; ov_options_converted_map["model_priority"] = "DEFAULT"; ov_options_converted_map["enable_qdq_optimizer"] = "false"; - ov_options_converted_map["enable_bfloat16_optimizer"] = "false"; ov_options_converted_map["enable_causallm"] = "false"; return ov_options_converted_map; } diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index cad66e4fd3566..bdc4f65e590d9 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1182,8 +1182,7 @@ static std::shared_ptr CreateExecutionProviderFactory #if defined(USE_OPENVINO) || defined(USE_OPENVINO_PROVIDER_INTERFACE) ProviderOptions OV_provider_options_map; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", - "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", - "enable_qdq_optimizer", "enable_bfloat16_optimizer", + "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", "enable_causallm", "disable_dynamic_shapes", "reshape_input"}; auto it = provider_options_map.find(type); if (it != provider_options_map.end()) { diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 5f2ea1cf2e2d7..7a210ca8482a4 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -764,13 +764,6 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); } else { ORT_THROW("[ERROR] [OpenVINO] The value for the key 'enable_qdq_optimizer' should be a boolean i.e. true or false. Default value is false.\n"); } - } else if (key == "enable_bfloat16_optimizer") { - if (value == "true" || value == "True" || - value == "false" || value == "False") { - ov_options[key] = value; - } else { - ORT_THROW("[ERROR] [OpenVINO] The value for the key 'enable_bfloat16_optimizer' should be a boolean i.e. true or false. 
Default value is false.\n"); - } } else if (key == "enable_causallm") { if (value == "true" || value == "True" || value == "false" || value == "False") { From c594c4da48d339c8a8567b3370a9f9bfd96ea606 Mon Sep 17 00:00:00 2001 From: "Klimenko, Mikhail" Date: Tue, 29 Jul 2025 10:34:19 +0000 Subject: [PATCH 4/6] Add tests --- .../openvino_ep_bfloat16_pass_test.cc | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc diff --git a/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc b/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc new file mode 100644 index 0000000000000..fc90563a61bb1 --- /dev/null +++ b/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include + +#include "core/session/onnxruntime_cxx_api.h" +#include "core/framework/float16.h" + +#include "test/util/include/test/test_environment.h" +#include "test/optimizer/qdq_test_utils.h" + +#include "gtest/gtest.h" +#include "gmock/gmock.h" + +using namespace ONNX_NAMESPACE; +using namespace onnxruntime::logging; + +extern std::unique_ptr ort_env; + +class OVEP_BF16_Tests : public ::testing::TestWithParam {}; + +namespace detail { +auto ConstructModel() { + using namespace onnxruntime; + using namespace test; + + std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = 19; + Model model("Bfloat16Tester", true, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, {}, DefaultLoggingManager().DefaultLogger()); + + Graph& graph = model.MainGraph(); + ModelTestBuilder builder(graph); + auto dim = 4; + std::vector input_data(dim, 1.0f); + auto* input = builder.MakeInput({dim}, input_data); + builder.graph_.SetInputs({input}); + + auto* cast_to_bf16 = builder.MakeIntermediate(); + Node& cast_node = builder.AddNode("Cast", {input}, {cast_to_bf16}, ""); + cast_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16)); + + std::vector weight_data(dim * dim); + for (std::size_t i = 0; i < weight_data.size(); ++i) + weight_data[i] = onnxruntime::BFloat16(static_cast(i % dim) / dim); + auto* weights = builder.MakeInitializer({dim, dim}, weight_data); + + auto* matmul_out = builder.MakeIntermediate(); + builder.AddNode("MatMul", {cast_to_bf16, weights}, {matmul_out}); + + std::vector weight_data_2(dim * dim); + for (std::size_t i = 0; i < weight_data_2.size(); ++i) + weight_data_2[i] = onnxruntime::BFloat16(static_cast(i % dim) / dim); + auto* weights_2 = builder.MakeInitializer({dim, dim}, weight_data_2); + + auto* matmul_out_2 = builder.MakeIntermediate(); + builder.AddNode("MatMul", {matmul_out, weights_2}, {matmul_out_2}); + + auto* output = builder.MakeOutput(); + Node& cast2_node = builder.AddNode("Cast", {matmul_out_2}, {output}); + cast2_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); + + builder.SetGraphOutputs(); + auto st = model.MainGraph().Resolve(); + if (st != Status::OK()) + throw std::runtime_error(st.ErrorMessage()); + return model; +} + +auto ProbeDevice(const std::string& device) { + static std::map is_present; + if (is_present.find(device) == is_present.end()) { + Ort::SessionOptions sessionOptions; + std::unordered_map ov_options; + ov_options["device_type"] = device; + try { + 
sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options); + is_present[device] = true; + } catch (...) { + is_present[device] = false; + } + } + return is_present[device]; +} +} // namespace detail + +namespace onnxruntime { +namespace test { + +TEST_P(OVEP_BF16_Tests, TestModelConversion) { + Ort::SessionOptions sessionOptions; + std::unordered_map ov_options; + const auto& device = GetParam(); + if (!::detail::ProbeDevice(device)) + GTEST_SKIP() << device + " is not available on this machine"; + + ov_options["device_type"] = device; + auto model = ::detail::ConstructModel(); + sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options); + + std::string model_data; + model.ToProto().SerializeToString(&model_data); + auto model_data_span = AsByteSpan(model_data.data(), model_data.size()); + try { + Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), sessionOptions); + } catch (...) { + FAIL(); + } +} +INSTANTIATE_TEST_SUITE_P(OVEP_Tests, + OVEP_BF16_Tests, + ::testing::Values("CPU", "GPU", "NPU")); +} // namespace test +} // namespace onnxruntime From 58e12daf8eed18b9adfb1ec058ac89393098af62 Mon Sep 17 00:00:00 2001 From: "Klimenko, Mikhail" Date: Wed, 30 Jul 2025 14:12:12 +0000 Subject: [PATCH 5/6] Rename detection function --- onnxruntime/core/providers/openvino/backend_manager.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index a18f9fc7a6530..cadeab4cbd4cc 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -375,7 +375,7 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) { return false; } -static bool HasBf16(const onnxruntime::GraphViewer& graph_viewer) { +static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) { const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); for (std::size_t i = 0; i < node_indices.size(); i++) { gsl::not_null node(graph_viewer.GetNode(node_indices[i])); @@ -468,7 +468,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } else if (HasBf16(subgraph)) { + } else if (IsModelBF16(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; std::unique_ptr model; Status status = bfloat16_fix::Transform(subgraph, logger, model); From d846db049ebb49d93d7a45815bf0351656b953e7 Mon Sep 17 00:00:00 2001 From: "Klimenko, Mikhail" Date: Wed, 30 Jul 2025 14:44:47 +0000 Subject: [PATCH 6/6] Fix CI for strict aliasing rules --- .../providers/openvino/qdq_transformations/qdq_scales_fix.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index 11137449cc9d4..f1ce230387565 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -973,8 +973,7 @@ void replace_bf16_with_fp16(qdq_scales_fix::CustomGraph& gen_graph) { for (int i = 0; i < tensor_proto->dims_size(); ++i) size *= tensor_proto->dims()[i]; for (std::int64_t i = 0; i < size; ++i) { - std::uint32_t tmp = 
static_cast<std::uint32_t>(raw_data[i]) << 16; - raw_data[i] = onnxruntime::MLFloat16(*reinterpret_cast<float*>(&tmp)).val; + raw_data[i] = onnxruntime::MLFloat16(onnxruntime::BFloat16::FromBits(raw_data[i])).val; } } }
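For reference, the numeric transformation behind this pass, and the strict-aliasing concern that patch 6/6 fixes, can be reproduced in standalone C++: a bfloat16 value is exactly the upper 16 bits of an IEEE-754 binary32, so widening it is a 16-bit left shift, after which the float is narrowed to float16. The sketch below is illustrative only; it uses std::memcpy instead of the original reinterpret_cast (the same aliasing trap BFloat16::FromBits sidesteps inside ORT), and its float -> fp16 helper is deliberately simplified (truncating rounding, subnormals flushed to zero, NaN mapped to infinity), unlike ORT's MLFloat16 constructor.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Widen a bfloat16 bit pattern to float: bf16 is the top 16 bits of an IEEE-754 binary32.
static float bf16_bits_to_float(std::uint16_t b) {
  std::uint32_t u = static_cast<std::uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &u, sizeof(f));  // memcpy avoids the strict-aliasing issue of reinterpret_cast
  return f;
}

// Simplified float -> float16 bit conversion: truncates the mantissa, flushes subnormals
// to zero, and maps out-of-range/NaN inputs to infinity. Enough to illustrate the idea.
static std::uint16_t float_to_fp16_bits(float f) {
  std::uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  std::uint32_t sign = (u >> 16) & 0x8000u;
  std::int32_t exp = static_cast<std::int32_t>((u >> 23) & 0xFFu) - 127 + 15;
  std::uint32_t mant = (u >> 13) & 0x3FFu;
  if (exp <= 0) return static_cast<std::uint16_t>(sign);             // underflow -> signed zero
  if (exp >= 31) return static_cast<std::uint16_t>(sign | 0x7C00u);  // overflow/NaN -> infinity
  return static_cast<std::uint16_t>(sign | (static_cast<std::uint32_t>(exp) << 10) | mant);
}

int main() {
  const std::uint16_t bf16_bits = 0x3FC0;  // 1.5 in bfloat16
  const float widened = bf16_bits_to_float(bf16_bits);
  std::printf("bf16 0x%04X -> float %g -> fp16 bits 0x%04X\n",
              static_cast<unsigned>(bf16_bits), widened,
              static_cast<unsigned>(float_to_fp16_bits(widened)));  // expects fp16 0x3E00
  return 0;
}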