From b217424293318ec08e9af6204b0e8ec9b87e4869 Mon Sep 17 00:00:00 2001
From: Eric Crawford
Date: Fri, 25 Apr 2025 17:01:12 -0700
Subject: [PATCH 1/2] Optimize CPU time spent in inference path

Move ONNX-to-OpenVINO input/output name binding to compilation time.
Reduce tensor lookups by name in favor of index lookups.
---
 .../core/providers/openvino/backend_utils.cc  |   4 +-
 .../openvino/backends/basic_backend.cc        | 136 +++++-------------
 .../openvino/backends/basic_backend.h         |  39 ++++-
 3 files changed, 78 insertions(+), 101 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc
index 2ee5e9ec3e3a9..1382c187f6b4e 100644
--- a/onnxruntime/core/providers/openvino/backend_utils.cc
+++ b/onnxruntime/core/providers/openvino/backend_utils.cc
@@ -121,7 +121,7 @@ std::istream& operator>>(std::istream& stream, SharedContext::SharedWeights::Met
 namespace backend_utils {
 bool IsDebugEnabled() {
-  const std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_DEBUG");
+  static std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_DEBUG");
   if (!env_name.empty()) {
     return true;
   }
@@ -129,7 +129,7 @@ bool IsDebugEnabled() {
 }
 bool IsCILogEnabled() {
-  const std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG");
+  static std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG");
   if (!env_name.empty()) {
     return true;
   }
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
index c7ea76fabe815..1ce815d44ec4c 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -140,6 +140,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
     };
   }
   inferRequestsQueue_ = std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(exe_network_, num_infer_req, std::move(initializer)));
+  bindings_ = std::make_unique<OnnxToOvNetworkBindings>(exe_network_, subgraph_context_);
 }
 
 bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map) {
@@ -362,29 +363,16 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) {
 // an Infer Request indexed by infer_req_idx
 void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
   try {
-    auto ov_input_info = exe_network_.Get().inputs();
-
-    // Loop over subgraph original input names to find the correspondent OV input name
-    for (const auto& [onnx_input_name, onnx_input_index] : subgraph_context_.input_names) {
-      std::string input_name{};
-      uint32_t input_idx = 0;
-      for (uint32_t index = 0; const auto& ov_input : ov_input_info) {
-        if (ov_input.get_names().contains(onnx_input_name)) {
-          input_name = onnx_input_name;
-          input_idx = index;
-          break;
-        }
-        index++;
-      }
-      ORT_ENFORCE(!input_name.empty(), log_tag,
", onnx_input_name, - " doesn't exist in the list of OpenVINO input tensor names"); + bool cpu_or_gpu = (session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos); + bool npu = (session_context_.device_type.find("NPU") != std::string::npos); + + for (const auto& input_info : bindings_->network_inputs_) { size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && !session_context_.disable_dynamic_shapes && - (session_context_.device_type.find("CPU") != std::string::npos || - session_context_.device_type.find("GPU") != std::string::npos)) { - auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name)); + cpu_or_gpu) { + auto tensor = context.GetInput(input_info.onnx_index); auto tensor_info = tensor.GetTensorTypeAndShapeInfo(); auto tensor_shape = tensor_info.GetShape(); auto tensor_size = tensor_shape.size(); @@ -395,98 +383,72 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque input_tensor_shape[tensor_iter] = *i; tensor_iter += 1; } - const auto& input = ov_input_info.at(input_idx); OVTensorPtr tensor_ptr; // avoid input copies on the CPU device if (session_context_.device_type.find("CPU") != std::string::npos) { - tensor_ptr = std::make_shared(input.get_element_type(), input_tensor_shape, + tensor_ptr = std::make_shared(input_info.type, input_tensor_shape, (void*)tensor_data); } else { - tensor_ptr = std::make_shared(input.get_element_type(), input_tensor_shape); - FillInputBlob(tensor_ptr, batch_slice_idx, input_name, context, subgraph_context_); + tensor_ptr = std::make_shared(input_info.type, input_tensor_shape); + FillInputBlob(tensor_ptr, batch_slice_idx, input_info.name, context, subgraph_context_); } try { - infer_request->SetTensor(std::move(input_name), tensor_ptr); + infer_request->SetTensor(input_info.name, tensor_ptr); } catch (const char* msg) { ORT_THROW(msg); } } else { - if ((session_context_.device_type.find("CPU") != std::string::npos || - session_context_.device_type.find("GPU") != std::string::npos)) { + if (cpu_or_gpu) { OVTensorPtr graph_input_blob; try { - graph_input_blob = infer_request->GetTensor(input_name); + graph_input_blob = infer_request->GetTensor(input_info.name); } catch (const char* msg) { ORT_THROW(msg); } - FillInputBlob(std::move(graph_input_blob), batch_slice_idx, std::move(input_name), context, subgraph_context_); + FillInputBlob(std::move(graph_input_blob), batch_slice_idx, input_info.name, context, subgraph_context_); } else { - auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name)); - ort_tensor_key_t ort_tensor_key{input_name}; + auto tensor = context.GetInput(input_info.onnx_index); + ort_tensor_key_t ort_tensor_key{input_info.name}; auto it = ort_ov_tensor_map.find(ort_tensor_key); - if ((it == ort_ov_tensor_map.end()) || - (it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) { + if ((it == ort_ov_tensor_map.end()) || it->second.ort_ptr != tensor.GetTensorRawData()) { ov_tensor_data_t ov_tensor_data; - const auto& input = ov_input_info.at(input_idx); - ov_tensor_data.tensor_ptr = std::make_shared(input.get_element_type(), input.get_shape(), + ov_tensor_data.tensor_ptr = std::make_shared(input_info.type, input_info.ov_shape, const_cast(tensor.GetTensorRawData())); ov_tensor_data.ort_ptr = tensor.GetTensorRawData(); ort_ov_tensor_map[ort_tensor_key] = ov_tensor_data; try { - infer_request->SetTensor(std::move(input_name), ov_tensor_data.tensor_ptr); + 
+              infer_request->SetTensor(input_info.name, ov_tensor_data.tensor_ptr);
             } catch (const char* msg) {
               ORT_THROW(msg);
             }
           }
         }
       }
-    }  // Loop subgraph original input names
+    }  // Loop subgraph original input
 
-    if (session_context_.device_type.find("NPU") != std::string::npos) {
+    if (npu) {
       // Set the output blob as remote blob
-      auto graph_output_info = exe_network_.Get().outputs();
-      auto output_idx = 0;
-      for (auto output_info_iter = graph_output_info.begin();
-           output_info_iter != graph_output_info.end(); ++output_info_iter) {
-        auto output_names = output_info_iter->get_names();
-        std::string onnx_output_name;
-        std::string output_name;
-        // using the output name retrieved from ONNX original to match with the output names returned by OV tensors
-        for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
-          onnx_output_name = it->first;
-          if (output_names.find(onnx_output_name) != output_names.end()) {
-            // Assigning the output_name
-            output_name = it->first;
-            break;
-          }
-        }
-        size_t batch_size = 1;
-        Ort::UnownedValue tensor = GetOutputTensor(context,
-                                                   batch_size,
-                                                   infer_request,
-                                                   output_name,
-                                                   subgraph_context_.output_names);
-        ort_tensor_key_t ort_tensor_key{output_name};
+      for (const auto& output_info : bindings_->network_outputs_) {
+        Ort::UnownedValue tensor = context.GetOutput(output_info.onnx_index, output_info.onnx_shape);
+
+        ort_tensor_key_t ort_tensor_key{output_info.name};
         const auto& it = ort_ov_tensor_map.find(ort_tensor_key);
-        if ((it == ort_ov_tensor_map.end()) ||
-            (it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) {
+        if ((it == ort_ov_tensor_map.end()) || (it->second.ort_ptr != tensor.GetTensorRawData())) {
           ov_tensor_data_t ov_tensor_data;
-          const auto& output = graph_output_info.at(output_idx);
           ov_tensor_data.ort_ptr = tensor.GetTensorRawData();
-          ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output.get_element_type(), output.get_shape(),
+          ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output_info.type, output_info.ov_shape,
                                                                    const_cast<void*>(tensor.GetTensorRawData()));
           ort_ov_tensor_map[ort_tensor_key] = ov_tensor_data;
 
           try {
-            infer_request->SetTensor(std::move(output_name), ov_tensor_data.tensor_ptr);
+            infer_request->SetTensor(output_info.name, ov_tensor_data.tensor_ptr);
           } catch (const char* msg) {
             ORT_THROW(msg);
           }
         }
-        output_idx++;
       }
     }
 
@@ -611,44 +573,22 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe
 void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
   // Wait for Async inference completion
   try {
+    bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
+                      session_context_.device_type.find("GPU") != std::string::npos;
+
     infer_request->WaitRequest();
-    auto graph_output_info = exe_network_.Get().outputs();
-    for (auto output_info_iter = graph_output_info.begin();
-         output_info_iter != graph_output_info.end(); ++output_info_iter) {
-      OVTensorPtr graph_output_blob;
-      auto output_names = output_info_iter->get_names();
-      std::string onnx_output_name;
-      std::string output_name;
-      bool output_name_found = false;
-      // using the output name retrieved from ONNX original to match with the output names returned by OV tensors
-      for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
-        onnx_output_name = it->first;
-        if (output_names.find(onnx_output_name) != output_names.end()) {
-          // Assigning the output_name
-          output_name = it->first;
-          output_name_found = true;
-          break;
-        }
-      }
-      if (!output_name_found) {
-        ORT_THROW(
-            log_tag +
-            "Output names mismatch between OpenVINO and ONNX. "
-            "[ONNX Output: ] " +
-            onnx_output_name +
-            " doesn't exist in the "
-            "list of OpenVINO output tensor names");
-      }
-      if ((session_context_.device_type.find("CPU") != std::string::npos ||
-           session_context_.device_type.find("GPU") != std::string::npos)) {
+
+    if (cpu_or_gpu) {
+      for (const auto& output_info : bindings_->network_outputs_) {
+        OVTensorPtr graph_output_blob;
         try {
-          graph_output_blob = infer_request->GetTensor(output_name);
+          graph_output_blob = infer_request->GetTensor(output_info.name);
         } catch (const char* msg) {
           ORT_THROW(msg);
         }
         size_t batch_size = 1;
         Ort::UnownedValue output_tensor =
-            GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
+            GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
         auto mem_info = output_tensor.GetTensorMemoryInfo();
         if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
           return;
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h
index 7d905f4a1e2f7..eaa8afb67c5e8 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.h
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -18,6 +18,7 @@
 #include "core/providers/openvino/contexts.h"
 #include "core/providers/openvino/ibackend.h"
 #include "core/providers/openvino/ov_interface.h"
+#include "core/providers/openvino/backend_utils.h"
 
 namespace onnxruntime {
 namespace openvino_ep {
@@ -27,6 +28,42 @@ struct ov_tensor_data_t {
   const void* ort_ptr;
 };
 
+struct OnnxToOvNetworkBindings {
+  struct ParameterInfo {
+    std::string name;
+    uint32_t ov_index;
+    uint32_t onnx_index;
+    ov::element::Type type;
+    ov::Shape ov_shape;
+    std::vector<int64_t> onnx_shape;
+  };
+  std::vector<ParameterInfo> network_outputs_;
+  std::vector<ParameterInfo> network_inputs_;
+
+  OnnxToOvNetworkBindings(OVExeNetwork& exec_network, SubGraphContext& subgraph_context) {
+    auto populate = [&](auto& input_output_map, const SubGraphContext::string_index_map_t& onnx_input_map, const auto& ov_parameters) {
+      for (const auto& [onnx_name, onnx_param_index] : onnx_input_map) {
+        auto it = std::find_if(ov_parameters.begin(), ov_parameters.end(),
+                               [&onnx_name](const auto& ov_parameter_info) { return ov_parameter_info.get_names().contains(onnx_name); });
+        auto ov_param_index = std::distance(ov_parameters.begin(), it);
+
+        ORT_ENFORCE(it != ov_parameters.end(), backend_utils::log_tag,
", onnx_name, + " doesn't exist in the list of OpenVINO input tensor names"); + auto shape = ov_parameters[ov_param_index].get_shape(); + auto type = ov_parameters[ov_param_index].get_element_type(); + + ParameterInfo info{onnx_name, ov_param_index, onnx_param_index, type, shape}; + std::transform(shape.begin(), shape.end(), std::back_inserter(info.onnx_shape), [](const auto& dim) { return static_cast(dim); }); + input_output_map.push_back(std::move(info)); + } + }; + + populate(network_inputs_, subgraph_context.input_names, exec_network.Get().inputs()); + populate(network_outputs_, subgraph_context.output_names, exec_network.Get().outputs()); + } +}; + class InferRequestsQueue; class BasicBackend : public IBackend { public: @@ -43,7 +80,6 @@ class BasicBackend : public IBackend { } private: - void PopulateCompiledDirectory(std::string, std::string&, std::string&, bool&); bool ValidateSubgraph(std::map>& const_outputs_map); void PopulateConfigValue(ov::AnyMap& device_config); void EnableCaching(); @@ -71,6 +107,7 @@ class BasicBackend : public IBackend { using ort_tensor_key_t = const std::string; std::map ort_ov_tensor_map; + std::unique_ptr bindings_; }; class InferRequestsQueue { From a8d4fe5821c6bdee8a848aa46f27cdebaebc01da Mon Sep 17 00:00:00 2001 From: Eric Crawford Date: Thu, 22 May 2025 17:27:55 -0700 Subject: [PATCH 2/2] Fix dynamic shape handling --- .../openvino/backends/basic_backend.cc | 4 ++-- .../providers/openvino/backends/basic_backend.h | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 1ce815d44ec4c..e77ff973f3a87 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -413,7 +413,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque auto it = ort_ov_tensor_map.find(ort_tensor_key); if ((it == ort_ov_tensor_map.end()) || it->second.ort_ptr != tensor.GetTensorRawData()) { ov_tensor_data_t ov_tensor_data; - ov_tensor_data.tensor_ptr = std::make_shared(input_info.type, input_info.ov_shape, + ov_tensor_data.tensor_ptr = std::make_shared(input_info.type, input_info.ov_shape.get_shape(), const_cast(tensor.GetTensorRawData())); ov_tensor_data.ort_ptr = tensor.GetTensorRawData(); @@ -439,7 +439,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque if ((it == ort_ov_tensor_map.end()) || (it->second.ort_ptr != tensor.GetTensorRawData())) { ov_tensor_data_t ov_tensor_data; ov_tensor_data.ort_ptr = tensor.GetTensorRawData(); - ov_tensor_data.tensor_ptr = std::make_shared(output_info.type, output_info.ov_shape, + ov_tensor_data.tensor_ptr = std::make_shared(output_info.type, output_info.ov_shape.get_shape(), const_cast(tensor.GetTensorRawData())); ort_ov_tensor_map[ort_tensor_key] = ov_tensor_data; diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index eaa8afb67c5e8..230d3cb5db34a 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/contexts.h" @@ -34,7 +36,7 @@ struct OnnxToOvNetworkBindings { uint32_t ov_index; uint32_t onnx_index; ov::element::Type 
     ov::element::Type type;
-    ov::Shape ov_shape;
+    ov::PartialShape ov_shape;
     std::vector<int64_t> onnx_shape;
   };
   std::vector<ParameterInfo> network_outputs_;
@@ -45,16 +47,21 @@ struct OnnxToOvNetworkBindings {
       for (const auto& [onnx_name, onnx_param_index] : onnx_input_map) {
         auto it = std::find_if(ov_parameters.begin(), ov_parameters.end(),
                                [&onnx_name](const auto& ov_parameter_info) { return ov_parameter_info.get_names().contains(onnx_name); });
-        auto ov_param_index = std::distance(ov_parameters.begin(), it);
 
         ORT_ENFORCE(it != ov_parameters.end(), backend_utils::log_tag,
                     "Input names mismatch between OpenVINO and ONNX. ", onnx_name,
                     " doesn't exist in the list of OpenVINO input tensor names");
-        auto shape = ov_parameters[ov_param_index].get_shape();
-        auto type = ov_parameters[ov_param_index].get_element_type();
+        auto ov_param_index = std::distance(ov_parameters.begin(), it);
+
+        auto shape = ov_parameters[ov_param_index].get_partial_shape();
+        auto type = ov_parameters[ov_param_index].get_element_type();
 
         ParameterInfo info{onnx_name, ov_param_index, onnx_param_index, type, shape};
-        std::transform(shape.begin(), shape.end(), std::back_inserter(info.onnx_shape), [](const auto& dim) { return static_cast<int64_t>(dim); });
+
+        if (shape.is_static()) {
+          auto static_shape = shape.get_shape();
+          std::transform(static_shape.begin(), static_shape.end(), std::back_inserter(info.onnx_shape), [](const auto& dim) { return static_cast<int64_t>(dim); });
+        }
         input_output_map.push_back(std::move(info));
       }
     };
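Note for reviewers: the core pattern of this series is hoisting the O(inputs x parameters) name matching out of StartAsyncInference/CompleteAsyncInference and into a bindings table built once per compiled network, so the per-request hot path only dereferences precomputed indices. Below is a minimal, self-contained sketch of that pattern; the types and names (EngineParam, Binding, BuildBindings) are illustrative stand-ins, not the ORT or OpenVINO API.

```cpp
// Standalone illustration (hypothetical types, not ORT/OpenVINO code):
// resolve ONNX-name -> engine-index bindings once when the backend is
// built, so the per-inference hot path does index lookups only.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

struct EngineParam {           // stands in for the compiled network's parameter info
  std::set<std::string> names;
};

struct Binding {
  std::string name;
  size_t engine_index;         // index into the compiled network's parameter list
  uint32_t onnx_index;         // index used with the kernel context's GetInput/GetOutput
};

// Done once at compilation: all name matching happens here, not per inference.
std::vector<Binding> BuildBindings(const std::map<std::string, uint32_t>& onnx_names,
                                   const std::vector<EngineParam>& engine_params) {
  std::vector<Binding> bindings;
  for (const auto& [name, onnx_index] : onnx_names) {
    auto it = std::find_if(engine_params.begin(), engine_params.end(),
                           [&](const EngineParam& p) { return p.names.count(name) != 0; });
    if (it == engine_params.end())
      throw std::runtime_error("Input names mismatch: " + name);
    bindings.push_back({name,
                        static_cast<size_t>(std::distance(engine_params.begin(), it)),
                        onnx_index});
  }
  return bindings;
}

int main() {
  std::map<std::string, uint32_t> onnx_inputs{{"input_ids", 0}, {"mask", 1}};
  std::vector<EngineParam> params{{{"mask"}}, {{"input_ids"}}};
  auto bindings = BuildBindings(onnx_inputs, params);

  // Per-inference hot path: plain index arithmetic, no string searches.
  for (const auto& b : bindings)
    std::cout << b.name << ": onnx #" << b.onnx_index
              << " -> engine #" << b.engine_index << '\n';
}
```

This mirrors what OnnxToOvNetworkBindings does in the patch: the nested name-search loops that previously ran on every StartAsyncInference call are paid once in the constructor, and the inference path iterates a flat vector of resolved bindings.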