From 3eaaa8d7a71b581b0d483a4d73a8012f150e1b45 Mon Sep 17 00:00:00 2001 From: "Peng, Bo" Date: Fri, 31 Oct 2025 10:28:33 +0800 Subject: [PATCH 1/5] disable bfloat16 conversion when single cast node to bfloat16, unit test case --- onnxruntime/core/providers/openvino/backend_manager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 74999ab10a67d..11667c73cb781 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -598,7 +598,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } else if (IsModelBF16(subgraph)) { + } else if (IsModelBF16(subgraph) && subgraph.GetNodesInTopologicalOrder().size() > 1) { // don't apply conversion when single cast node graph (unit test case) LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; std::unique_ptr model; Status status = bfloat16_fix::Transform(subgraph, logger, model); From 18278585aa5bbfed73567d71dbb57ee24d1e8e9e Mon Sep 17 00:00:00 2001 From: "Peng, Bo" Date: Mon, 3 Nov 2025 15:28:00 +0800 Subject: [PATCH 2/5] Insert a Cast(To:BFloat16) before output node(bfloat16) to keep user use original bf16 outputs tensor --- .../providers/openvino/backend_manager.cc | 2 +- .../qdq_transformations/qdq_scales_fix.cpp | 40 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 11667c73cb781..74999ab10a67d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -598,7 +598,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } else if (IsModelBF16(subgraph) && subgraph.GetNodesInTopologicalOrder().size() > 1) { // don't apply conversion when single cast node graph (unit test case) + } else if (IsModelBF16(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; std::unique_ptr model; Status status = bfloat16_fix::Transform(subgraph, logger, model); diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index de0e8a97fb6b0..df82119953f49 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -958,7 +958,47 @@ Status Transform(const GraphViewer& src_graph_viewer, namespace bfloat16_fix { void replace_bf16_with_fp16(qdq_scales_fix::CustomGraph& gen_graph) { + auto& graph = gen_graph.original_graph; + std::unordered_set protected_nodes; + + // To keep the BF16 output, insert a Cast node before it. + // Add the inserted Cast node and the Identity node to protected_nodes. + // The data flow becomes: Modified Internal Graph (FP16) -> Cast(to BF16) -> Identity -> Output(BF16). + for (const auto* output_arg : graph.GetOutputs()) { + if (output_arg->TypeAsProto() && + output_arg->TypeAsProto()->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) { + const Node* identity_node = graph.GetProducerNode(output_arg->Name()); + if (!identity_node || identity_node->OpType() != "Identity") { + continue; + } + protected_nodes.insert(identity_node->Index()); + + // Create the new Cast node to keep bf16 output. + std::string cast_node_name = "InsertCastToBf16_" + output_arg->Name(); + const NodeArg* cast_input_arg = identity_node->InputDefs()[0]; + auto& cast_output_arg = graph.GetOrCreateNodeArg(cast_input_arg->Name() + "_bf16", output_arg->TypeAsProto()); + InlinedVector cast_inputs = {const_cast(cast_input_arg)}; + InlinedVector cast_outputs = {&cast_output_arg}; + Node& cast_node = graph.AddNode(cast_node_name, "Cast", "Cast internal FP16 to BF16 output", cast_inputs, cast_outputs, nullptr, ""); + cast_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16)); + protected_nodes.insert(cast_node.Index()); + + // Reroute the graph edges. + auto edge_it = identity_node->InputEdgesBegin(); + if (edge_it != identity_node->InputEdgesEnd()) { + const Node& producer_node = edge_it->GetNode(); + int producer_arg_index = edge_it->GetSrcArgIndex(); + graph.RemoveEdge(producer_node.Index(), identity_node->Index(), producer_arg_index, 0); + graph.AddEdge(producer_node.Index(), cast_node.Index(), producer_arg_index, 0); + graph.AddEdge(cast_node.Index(), identity_node->Index(), 0, 0); + } + } + } + for (auto& const_node : gen_graph.original_graph.Nodes()) { + if (protected_nodes.count(const_node->Index())) { + continue; + } auto node = const_cast(const_node); if (node->OpType() == "Cast") { for (auto& [name, const_attribute] : node->GetAttributes()) { From 415fdfeb10ce988496dd6677d8d3f13c38824992 Mon Sep 17 00:00:00 2001 From: "Peng, Bo" Date: Mon, 3 Nov 2025 16:42:05 +0800 Subject: [PATCH 3/5] revert changes to add Cast Node, add statement to disable bfloat16 transform for OV CPU --- .../providers/openvino/backend_manager.cc | 2 +- .../qdq_transformations/qdq_scales_fix.cpp | 40 ------------------- 2 files changed, 1 insertion(+), 41 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 74999ab10a67d..741b76ce203c9 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -598,7 +598,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } else if (IsModelBF16(subgraph)) { + } else if ((session_context_.device_type.find("CPU") == std::string::npos) && IsModelBF16(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; std::unique_ptr model; Status status = bfloat16_fix::Transform(subgraph, logger, model); diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index df82119953f49..de0e8a97fb6b0 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -958,47 +958,7 @@ Status Transform(const GraphViewer& src_graph_viewer, namespace bfloat16_fix { void replace_bf16_with_fp16(qdq_scales_fix::CustomGraph& gen_graph) { - auto& graph = gen_graph.original_graph; - std::unordered_set protected_nodes; - - // To keep the BF16 output, insert a Cast node before it. - // Add the inserted Cast node and the Identity node to protected_nodes. - // The data flow becomes: Modified Internal Graph (FP16) -> Cast(to BF16) -> Identity -> Output(BF16). - for (const auto* output_arg : graph.GetOutputs()) { - if (output_arg->TypeAsProto() && - output_arg->TypeAsProto()->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) { - const Node* identity_node = graph.GetProducerNode(output_arg->Name()); - if (!identity_node || identity_node->OpType() != "Identity") { - continue; - } - protected_nodes.insert(identity_node->Index()); - - // Create the new Cast node to keep bf16 output. - std::string cast_node_name = "InsertCastToBf16_" + output_arg->Name(); - const NodeArg* cast_input_arg = identity_node->InputDefs()[0]; - auto& cast_output_arg = graph.GetOrCreateNodeArg(cast_input_arg->Name() + "_bf16", output_arg->TypeAsProto()); - InlinedVector cast_inputs = {const_cast(cast_input_arg)}; - InlinedVector cast_outputs = {&cast_output_arg}; - Node& cast_node = graph.AddNode(cast_node_name, "Cast", "Cast internal FP16 to BF16 output", cast_inputs, cast_outputs, nullptr, ""); - cast_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16)); - protected_nodes.insert(cast_node.Index()); - - // Reroute the graph edges. - auto edge_it = identity_node->InputEdgesBegin(); - if (edge_it != identity_node->InputEdgesEnd()) { - const Node& producer_node = edge_it->GetNode(); - int producer_arg_index = edge_it->GetSrcArgIndex(); - graph.RemoveEdge(producer_node.Index(), identity_node->Index(), producer_arg_index, 0); - graph.AddEdge(producer_node.Index(), cast_node.Index(), producer_arg_index, 0); - graph.AddEdge(cast_node.Index(), identity_node->Index(), 0, 0); - } - } - } - for (auto& const_node : gen_graph.original_graph.Nodes()) { - if (protected_nodes.count(const_node->Index())) { - continue; - } auto node = const_cast(const_node); if (node->OpType() == "Cast") { for (auto& [name, const_attribute] : node->GetAttributes()) { From a6f7cbb15f1a691b3ef9e7e805cc5fa8c79a3757 Mon Sep 17 00:00:00 2001 From: "Peng, Bo" Date: Thu, 6 Nov 2025 19:25:52 +0800 Subject: [PATCH 4/5] remove bfloat16 silence conversion --- .../providers/openvino/backend_manager.cc | 22 -------- .../qdq_transformations/qdq_scales_fix.cpp | 55 ------------------- .../qdq_transformations/qdq_scales_fix.h | 5 -- 3 files changed, 82 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 741b76ce203c9..4a20847c0890c 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -389,18 +389,6 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) { return false; } -static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) { - const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); - for (std::size_t i = 0; i < node_indices.size(); i++) { - gsl::not_null node(graph_viewer.GetNode(node_indices[i])); - for (auto& output : node->OutputDefs()) { - if (output->ToProto().type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) - return true; - } - } - return false; -} - static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) { const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr; return type_proto && type_proto->has_tensor_type() && @@ -598,16 +586,6 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; - } else if ((session_context_.device_type.find("CPU") == std::string::npos) && IsModelBF16(subgraph)) { - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; - std::unique_ptr model; - Status status = bfloat16_fix::Transform(subgraph, logger, model); - auto model_proto = model->ToProto(); - model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); - print_model_proto_duration(); - DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); - ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); - return model_proto; } else { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled"; diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index de0e8a97fb6b0..161100f63fd26 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -955,60 +955,5 @@ Status Transform(const GraphViewer& src_graph_viewer, return status; } } // namespace qdq_scales_fix - -namespace bfloat16_fix { -void replace_bf16_with_fp16(qdq_scales_fix::CustomGraph& gen_graph) { - for (auto& const_node : gen_graph.original_graph.Nodes()) { - auto node = const_cast(const_node); - if (node->OpType() == "Cast") { - for (auto& [name, const_attribute] : node->GetAttributes()) { - auto& attribute = const_cast(const_attribute); - if (name == "to" && attribute.type() == ONNX_NAMESPACE::AttributeProto_AttributeType_INT) - if (attribute.i() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) - attribute.set_i(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); - } - } - for (auto& output : node->OutputDefs()) { - auto& output_proto = const_cast(output->ToProto().type()); - if (output_proto.mutable_tensor_type()->elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) - output_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); - } - } - - for (auto& node : gen_graph.original_graph.Nodes()) { - for (auto& input_def : node->InputDefs()) { - ORT_THROW_IF_ERROR(graph_utils::ConvertInMemoryDataToInline(gen_graph.original_graph, input_def->Name())); - } - } - - const auto& init_set = gen_graph.original_graph.GetAllInitializedTensors(); - for (auto& [key, const_tensor_proto] : init_set) { - auto tensor_proto = const_cast(const_tensor_proto); - auto dt = tensor_proto->data_type(); - if (dt == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) { - auto raw_data = tensor_proto->has_raw_data() ? reinterpret_cast(tensor_proto->mutable_raw_data()->data()) : nullptr; - if (raw_data) { - tensor_proto->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); - std::int64_t size = 1; - for (int i = 0; i < tensor_proto->dims_size(); ++i) - size *= tensor_proto->dims()[i]; - for (std::int64_t i = 0; i < size; ++i) { - raw_data[i] = onnxruntime::MLFloat16(onnxruntime::BFloat16::FromBits(raw_data[i])).val; - } - } - } - } -} - -Status Transform(const GraphViewer& src_graph_viewer, - const logging::Logger& logger, - /*out*/ std::unique_ptr& model) { - auto status = qdq_scales_fix::copy_model(src_graph_viewer, logger, model); - auto g = qdq_scales_fix::generate_graph_from_onnx(model->MainGraph()); - - replace_bf16_with_fp16(g); - return status; -} -} // namespace bfloat16_fix } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h index 2182850d96c43..c54c531e1bd40 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h @@ -15,10 +15,5 @@ Status Transform(const GraphViewer& src_graph, const logging::Logger& logger, /*out*/ std::unique_ptr& model); } -namespace bfloat16_fix { -Status Transform(const GraphViewer& src_graph, - const logging::Logger& logger, - /*out*/ std::unique_ptr& model); -} } // namespace openvino_ep } // namespace onnxruntime From 46b1c78d1674330fbb1babdac8a9cb94f6bf547b Mon Sep 17 00:00:00 2001 From: "Peng, Bo" Date: Fri, 7 Nov 2025 10:27:23 +0800 Subject: [PATCH 5/5] remove bf16 testing and cpu support for openvino --- .../openvino/ov_versions/data_ops.cc | 4 +- .../qdq_transformations/qdq_scales_fix.cpp | 1 - .../openvino_ep_bfloat16_pass_test.cc | 116 ------------------ 3 files changed, 1 insertion(+), 120 deletions(-) delete mode 100644 onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 037cb6a1270ea..4156b45cd638a 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -561,9 +561,7 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { } auto dtype = type_proto->tensor_type().elem_type(); - // Enable bfloat16 -> float16 on-the-fly conversion - if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16 || - dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || + if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16) return true; if (is_initializer) { diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index 161100f63fd26..a7b5c51882ff4 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -4,7 +4,6 @@ #include "qdq_scales_fix.h" #include "core/providers/openvino/ov_protobuf_utils.h" #include "core/framework/ort_value.h" -#include "core/common/float16.h" #include #include diff --git a/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc b/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc deleted file mode 100644 index 105a35011a78d..0000000000000 --- a/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include -#include -#include - -#include "core/session/onnxruntime_cxx_api.h" -#include "core/common/float16.h" - -#include "test/util/include/test/test_environment.h" -#include "test/unittest_util/qdq_test_utils.h" - -#include "gtest/gtest.h" -#include "gmock/gmock.h" - -using namespace ONNX_NAMESPACE; -using namespace onnxruntime::logging; - -extern std::unique_ptr ort_env; - -class OVEP_BF16_Tests : public ::testing::TestWithParam {}; - -namespace detail { -auto ConstructModel() { - using namespace onnxruntime; - using namespace test; - - std::unordered_map domain_to_version; - domain_to_version[kOnnxDomain] = 19; - Model model("Bfloat16Tester", true, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), - domain_to_version, {}, DefaultLoggingManager().DefaultLogger()); - - Graph& graph = model.MainGraph(); - ModelTestBuilder builder(graph); - auto dim = 4; - std::vector input_data(dim, 1.0f); - auto* input = builder.MakeInput({dim}, input_data); - builder.graph_.SetInputs({input}); - - auto* cast_to_bf16 = builder.MakeIntermediate(); - Node& cast_node = builder.AddNode("Cast", {input}, {cast_to_bf16}, ""); - cast_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16)); - - std::vector weight_data(dim * dim); - for (std::size_t i = 0; i < weight_data.size(); ++i) - weight_data[i] = onnxruntime::BFloat16(static_cast(i % dim) / dim); - auto* weights = builder.MakeInitializer({dim, dim}, weight_data); - - auto* matmul_out = builder.MakeIntermediate(); - builder.AddNode("MatMul", {cast_to_bf16, weights}, {matmul_out}); - - std::vector weight_data_2(dim * dim); - for (std::size_t i = 0; i < weight_data_2.size(); ++i) - weight_data_2[i] = onnxruntime::BFloat16(static_cast(i % dim) / dim); - auto* weights_2 = builder.MakeInitializer({dim, dim}, weight_data_2); - - auto* matmul_out_2 = builder.MakeIntermediate(); - builder.AddNode("MatMul", {matmul_out, weights_2}, {matmul_out_2}); - - auto* output = builder.MakeOutput(); - Node& cast2_node = builder.AddNode("Cast", {matmul_out_2}, {output}); - cast2_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); - - builder.SetGraphOutputs(); - auto st = model.MainGraph().Resolve(); - if (st != Status::OK()) - throw std::runtime_error(st.ErrorMessage()); - return model; -} - -auto ProbeDevice(const std::string& device) { - static std::map is_present; - if (is_present.find(device) == is_present.end()) { - Ort::SessionOptions sessionOptions; - std::unordered_map ov_options; - ov_options["device_type"] = device; - try { - sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options); - is_present[device] = true; - } catch (...) { - is_present[device] = false; - } - } - return is_present[device]; -} -} // namespace detail - -namespace onnxruntime { -namespace test { - -TEST_P(OVEP_BF16_Tests, TestModelConversion) { - Ort::SessionOptions sessionOptions; - std::unordered_map ov_options; - const auto& device = GetParam(); - if (!::detail::ProbeDevice(device)) - GTEST_SKIP() << device + " is not available on this machine"; - - ov_options["device_type"] = device; - auto model = ::detail::ConstructModel(); - sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options); - - std::string model_data; - model.ToProto().SerializeToString(&model_data); - auto model_data_span = AsByteSpan(model_data.data(), model_data.size()); - try { - Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), sessionOptions); - } catch (...) { - FAIL(); - } -} -INSTANTIATE_TEST_SUITE_P(OVEP_Tests, - OVEP_BF16_Tests, - ::testing::Values("CPU", "GPU", "NPU")); -} // namespace test -} // namespace onnxruntime