From 74a7ffc2c3b44f1df0527eff8bef3ec47db2c502 Mon Sep 17 00:00:00 2001
From: TejalKhade28
Date: Fri, 23 May 2025 20:19:01 +0530
Subject: [PATCH 1/3] Catch exception with TDR

---
 .../openvino/backends/basic_backend.cc        | 55 ++++++++++---------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
index e77ff973f3a87..9085bfea5f0c4 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -573,36 +573,39 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe
 void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
   // Wait for Async inference completion
   try {
-    bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
-                      session_context_.device_type.find("GPU") != std::string::npos;
-
     infer_request->WaitRequest();
+  } catch (const char* msg) {
+    ORT_THROW(msg);
+  }
 
-    if (cpu_or_gpu) {
-      for (const auto& output_info : bindings_->network_outputs_) {
-        OVTensorPtr graph_output_blob;
-        try {
-          graph_output_blob = infer_request->GetTensor(output_info.name);
-        } catch (const char* msg) {
-          ORT_THROW(msg);
-        }
-        size_t batch_size = 1;
-        Ort::UnownedValue output_tensor =
-            GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
-        auto mem_info = output_tensor.GetTensorMemoryInfo();
-        if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
+  bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
+                    session_context_.device_type.find("GPU") != std::string::npos;
+  if (cpu_or_gpu) {
+    for (const auto& output_info : bindings_->network_outputs_) {
+      OVTensorPtr graph_output_blob;
+      try {
+        graph_output_blob = infer_request->GetTensor(output_info.name);
+      } catch (const char* msg) {
+        ORT_THROW(msg);
+      }
+      size_t batch_size = 1;
+      Ort::UnownedValue output_tensor =
+          GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
+      auto mem_info = output_tensor.GetTensorMemoryInfo();
+      if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
           return;
-        } else {
-          size_t batch_slice = 0;
-          FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
-        }
+      } else {
+        size_t batch_slice = 0;
+        FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
       }
     }
+  }
-    if (!const_outputs_map_.empty()) {
-      for (const auto& item : const_outputs_map_) {
-        const auto& out_name = item.first;
-        auto node = item.second;
+  if (!const_outputs_map_.empty()) {
+    for (const auto& item : const_outputs_map_) {
+      const auto& out_name = item.first;
+      auto node = item.second;
+      try {
         Ort::UnownedValue output_tensor =
             GetOutputTensor(context,
                             out_name,
                             subgraph_context_.output_names,
@@ -613,10 +616,10 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
         } else {
           FillOutputsWithConstantData(std::move(node), output_tensor);
         }
+      } catch (std::string const& msg) {
+        ORT_THROW(msg);
       }
     }
-  } catch (const char* msg) {
-    ORT_THROW(msg);
   }
 }
 
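The hunks above pull infer_request->WaitRequest() out of the block that copies outputs, so a wait that fails (for example when a GPU Timeout Detection and Recovery event resets the device) is rethrown before any output tensor is touched, and the output-copy and constant-output paths each get their own narrow catch. A minimal, self-contained sketch of that wait-then-copy ordering is below; FakeInferRequest, CompleteInference and the "[OpenVINO-EP]" prefix are stand-ins invented for the illustration rather than the provider's real types, so read it as the control flow the patch aims for, not as the backend code itself.

#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Simplified stand-in for an OpenVINO infer request: wait() may throw if the
// device was reset (e.g. a Windows TDR event) while the request was running.
struct FakeInferRequest {
  std::function<void()> wait;
  std::vector<float> output{1.0f, 2.0f, 3.0f};
};

// Mirrors the ordering introduced by the patch: the wait sits in its own
// try/catch and is rethrown immediately; output copying runs only afterwards.
std::vector<float> CompleteInference(FakeInferRequest& request) {
  try {
    request.wait();
  } catch (const std::runtime_error& e) {
    throw std::runtime_error(std::string("[OpenVINO-EP] ") + e.what());
  }
  // Reached only when the wait succeeded, so the outputs are safe to read.
  return request.output;
}

int main() {
  FakeInferRequest ok{[] {}};
  std::cout << "copied " << CompleteInference(ok).size() << " values\n";

  FakeInferRequest reset{[] { throw std::runtime_error("device was reset"); }};
  try {
    CompleteInference(reset);
  } catch (const std::runtime_error& e) {
    std::cout << "caught: " << e.what() << "\n";
  }
  return 0;
}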
From 321351679e02a0e449b363eb3ef05f8539947481 Mon Sep 17 00:00:00 2001
From: TejalKhade28
Date: Sat, 31 May 2025 12:43:09 +0530
Subject: [PATCH 2/3] Handle exceptions during parallel execution with OVEP

---
 .../providers/openvino/backends/basic_backend.cc    | 14 +++++++++++---
 .../providers/openvino/backends/basic_backend.h     | 13 +++++++++++++
 .../core/providers/openvino/ov_interface.cc         | 10 +++++++---
 onnxruntime/core/providers/openvino/ov_interface.h  |  1 +
 onnxruntime/test/perftest/performance_runner.cc     |  9 ++++++---
 5 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
index 9085bfea5f0c4..8737016cd902c 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -574,8 +574,10 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
   // Wait for Async inference completion
   try {
     infer_request->WaitRequest();
-  } catch (const char* msg) {
-    ORT_THROW(msg);
+  } catch (const std::runtime_error& e) {
+    infer_request->CancelRequest();
+    inferRequestsQueue_->deleteRequest();
+    ORT_THROW(log_tag + e.what());
   }
 
   bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
@@ -653,9 +655,15 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
     }
 
   } else {
-    // Requesting for an idle infer_request from a pool of infer_requests_
     OVInferRequestPtr infer_request;
     infer_request = inferRequestsQueue_->getIdleRequest();
+    if (infer_request == nullptr) {
+      LOGS_DEFAULT(FATAL) << log_tag << "No idle infer request available";
+      ORT_THROW("OpenVINO Execution Provider :: There are no inference requests");
+      return;
+    }
+
+    LOGS_DEFAULT(INFO) << log_tag << "Get Idle Request";
 #ifdef IO_BUFFER_ENABLED
     if ((session_context_.device_type.find("GPU") != std::string::npos) &&
         (session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) {
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h
index 130699abd465b..49fbeeed3af27 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.h
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -121,6 +121,7 @@ class InferRequestsQueue {
  public:
   InferRequestsQueue(OVExeNetwork& net, size_t nireq, std::function<void(OVInferRequestPtr)> initializer) {
     OVInferRequestPtr infer_request;
+    live_threads = nireq;
     for (size_t id = 0; id < nireq; id++) {
       infer_request = std::make_shared<OVInferRequest>(net.CreateInferRequest());
       initializer(infer_request);
@@ -152,16 +153,28 @@ class InferRequestsQueue {
 
   OVInferRequestPtr getIdleRequest() {
     std::unique_lock lock(_mutex);
+    std::cout << "get Idle Request " << live_threads << "\n";
+    if (live_threads == 0) {
+      return nullptr;
+    }
+
     _cv.wait(lock, [this] { return infer_requests_.size() > 0; });
     auto request = infer_requests_.at(0);
     infer_requests_.erase(infer_requests_.begin());
     return request;
   }
 
+  void deleteRequest() {
+    std::unique_lock lock(_mutex);
+    live_threads = live_threads - 1;
+    std::cout << "delete Request " << live_threads << "\n";
+  }
+
  private:
  std::mutex _mutex;
  std::condition_variable _cv;
  std::vector<OVInferRequestPtr> infer_requests_;
+  int live_threads;
 };
 
 }  // namespace openvino_ep
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index a175ca863d1d1..87da0ade21551 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -294,12 +294,16 @@ void OVInferRequest::Infer() {
 }
 
 void OVInferRequest::WaitRequest() {
+  ovInfReq.wait();
+}
+
+void OVInferRequest::CancelRequest() {
   try {
-    ovInfReq.wait();
+    ovInfReq.cancel();
   } catch (const Exception& e) {
-    ORT_THROW(log_tag + " Wait Model Failed: " + e.what());
+    ORT_THROW(log_tag + " Cancel Model Failed: " + e.what());
   } catch (...) {
-    ORT_THROW(log_tag + " Wait Mode Failed");
+    ORT_THROW(log_tag + " Cancel Mode Failed");
   }
 }
 
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index bebe73bd702dd..079426e2d67fb 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -122,6 +122,7 @@ class OVInferRequest {
   void StartAsync();
   void Infer();
   void WaitRequest();
+  void CancelRequest();
   void QueryStatus();
   explicit OVInferRequest(ov::InferRequest obj) : ovInfReq(std::move(obj)) {}
   OVInferRequest() : ovInfReq(ov::InferRequest()) {}
diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc
index faf0c34193717..8ec9694227c14 100644
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@@ -203,8 +203,9 @@ Status PerformanceRunner::RunParallelDuration() {
     counter++;
     tpool->Schedule([this, &counter, &m, &cv]() {
       auto status = RunOneIteration();
-      if (!status.IsOK())
+      if (!status.IsOK()) {
         std::cerr << status.ErrorMessage();
+      }
       // Simplified version of Eigen::Barrier
       std::lock_guard lg(m);
       counter--;
@@ -216,8 +217,10 @@ Status PerformanceRunner::RunParallelDuration() {
   } while (duration_seconds.count() < performance_test_config_.run_config.duration_in_seconds);
 
   // Join
-  std::unique_lock lock(m);
-  cv.wait(lock, [&counter]() { return counter == 0; });
+  tpool->Schedule([this, &counter, &m, &cv]() {
+    std::unique_lock lock(m);
+    cv.wait(lock, [&counter]() { return counter == 0; });
+  });
 
   return Status::OK();
 }
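Patch 2 pairs the new OVInferRequest::CancelRequest() with bookkeeping in InferRequestsQueue: a request whose wait failed is cancelled, retired through deleteRequest(), and never handed back out, and BasicBackend::Infer() gives up once getIdleRequest() reports that the pool is exhausted. The sketch below condenses that bookkeeping into a standalone class; RequestPool, RequestPtr and the extra "pool drained" clause in the wait predicate are assumptions made for the illustration (the patch itself only waits for infer_requests_ to become non-empty), so it shows the intended behaviour rather than the exact OVEP code.

#include <condition_variable>
#include <iostream>
#include <memory>
#include <mutex>
#include <vector>

// Condensed sketch of the live-request bookkeeping added in this patch.
// RequestPtr stands in for OVInferRequestPtr.
using RequestPtr = std::shared_ptr<int>;

class RequestPool {
 public:
  explicit RequestPool(size_t nireq) : live_requests_(nireq) {
    for (size_t id = 0; id < nireq; ++id) {
      requests_.push_back(std::make_shared<int>(static_cast<int>(id)));
    }
  }

  // Returns nullptr once every request has been retired, so callers can stop
  // submitting work instead of blocking forever.
  RequestPtr GetIdleRequest() {
    std::unique_lock<std::mutex> lock(mutex_);
    // Assumption: also wake up when the pool has drained, not only when a
    // request becomes idle again.
    cv_.wait(lock, [this] { return !requests_.empty() || live_requests_ == 0; });
    if (live_requests_ == 0) return nullptr;
    RequestPtr request = requests_.front();
    requests_.erase(requests_.begin());
    return request;
  }

  // Put a request back after a successful inference.
  void PutIdleRequest(RequestPtr request) {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      requests_.push_back(std::move(request));
    }
    cv_.notify_one();
  }

  // Retire a request whose wait failed (e.g. after a device reset); it is not
  // returned to the pool.
  void DeleteRequest() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      --live_requests_;
    }
    cv_.notify_all();
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  std::vector<RequestPtr> requests_;
  size_t live_requests_;
};

int main() {
  RequestPool pool(1);
  RequestPtr first = pool.GetIdleRequest();  // hand out the only request
  std::cout << "got request " << *first << "\n";
  pool.DeleteRequest();                      // pretend its wait failed
  std::cout << (pool.GetIdleRequest() == nullptr ? "pool exhausted\n" : "got a request\n");
  return 0;
}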
From fd6bcac279ebade813b8c6f412c82d37bfa211f9 Mon Sep 17 00:00:00 2001
From: TejalKhade28
Date: Mon, 2 Jun 2025 15:48:33 +0530
Subject: [PATCH 3/3] Remove IO Buffer Implementation

---
 cmake/onnxruntime_providers_openvino.cmake    |   5 -
 .../openvino/backends/basic_backend.cc        | 148 ------------------
 .../openvino/backends/basic_backend.h         |   9 --
 .../core/providers/openvino/ov_interface.cc   |  32 ----
 .../core/providers/openvino/ov_interface.h    |  19 ---
 5 files changed, 213 deletions(-)

diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake
index 03f67983c70ab..d7cb2d5ea0d0f 100644
--- a/cmake/onnxruntime_providers_openvino.cmake
+++ b/cmake/onnxruntime_providers_openvino.cmake
@@ -30,11 +30,6 @@ endif()
 
   list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})
 
-  if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}) AND onnxruntime_USE_OPENVINO_GPU)
-    add_definitions(-DIO_BUFFER_ENABLED=1)
-    list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS})
-  endif()
-
   source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs})
 
   onnxruntime_add_shared_library_module(onnxruntime_providers_openvino ${onnxruntime_providers_openvino_cc_srcs} "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc")
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
index 8737016cd902c..dedb6da1bae58 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -62,25 +62,6 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr
   try {
     // IO_BUFFER is enabled on GPU HW.
    // Pre-requisite is provider_option "context" must be set
-#if defined(IO_BUFFER_ENABLED)
-    cl_context ctx = static_cast<cl_context>(session_context_.context);
-    remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get()->core, ctx);
-    if (subgraph_context_.is_ep_ctx_graph) {
-      exe_network_ = OVCore::Get()->ImportModel(*model_stream,
-                                                remote_context_,
-                                                subgraph_context_.subgraph_name);
-      model_stream.reset();  // Delete stream after it is no longer needed
-    } else {
-      std::string model = model_proto->SerializeAsString();
-      if (!subgraph_context.has_dynamic_input_shape) {
-        model_proto.reset();
-      }
-      auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
-      LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled";
-      exe_network_ = OVCore::Get()->CompileModel(
-          ov_model, remote_context_, subgraph_context_.subgraph_name);
-    }
-#else  // !IO_BUFFER_ENABLED
     auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
                                  (session_context_.OpenVINO_Version.at(0) >= 2024 &&
                                   session_context_.OpenVINO_Version.at(1) > 2));
@@ -117,7 +98,6 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr
       exe_network_ = OVCore::Get()->CompileModel(
           ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
     }
-#endif
     LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
   } catch (const char* msg) {
     ORT_THROW(msg);
@@ -459,115 +439,6 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
   }
 }
 
-#ifdef IO_BUFFER_ENABLED
-// Wait for Remote Aynchronous inference completion
-void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
-  try {
-    auto graph_input_info = exe_network_.Get().inputs();
-    int input_idx = 0;
-    for (auto input_info_iter = graph_input_info.begin();
-         input_info_iter != graph_input_info.end(); ++input_info_iter) {
-      auto input_names = input_info_iter->get_names();
-      std::string onnx_input_name;
-      std::string input_name;
-      // use names retrieved from original ONNX model to assign the right onnx input name for the graph
-      for (auto it = subgraph_context_.input_names.begin(); it != subgraph_context_.input_names.end(); ++it) {
-        if (it->second == input_idx) {
-          onnx_input_name = it->first;
-          break;
-        }
-      }
-      // using the input name retrieved from ONNX original to match with the input names returned by OV tensors
-      if (input_names.find(onnx_input_name) != input_names.end()) {
-        input_name = onnx_input_name;
-      } else {
-        ORT_THROW(log_tag +
-                  "Input names mismatch between OpenVINO and ONNX. " +
-                  onnx_input_name +
-                  " doesn't exist in the list of OpenVINO input tensor names");
-      }
-      input_idx++;
-      // Kernel Context Input Buffer
-      const auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
-      // If the ORTValue wraps a device pointer
-      auto mem_info = tensor.GetTensorMemoryInfo();
-      if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
-        // Get the shared buffer pointer
-        const void* tensor_data = tensor.GetTensorRawData();
-        const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
-        // Create an Input Remote Blob
-        auto input = graph_input_info.at(0);
-        auto remote_blob = remote_context_->create_tensor(
-            input.get_element_type(), input.get_shape(), *shared_buffer_const);
-        ov::Tensor tensor_remote = static_cast<ov::Tensor>(remote_blob);
-        OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_remote);
-        infer_request->SetTensor(input_name, tensor_ptr);
-      } else {
-        OVTensorPtr graph_input_blob;
-        graph_input_blob = infer_request->GetTensor(input_name);
-        size_t batch_slice_idx = 0;
-        FillInputBlob(graph_input_blob, batch_slice_idx, input_name, context, subgraph_context_);
-      }
-    }
-
-    // Set the output blob as remote blob
-    auto graph_output_info = exe_network_.Get().outputs();
-    for (auto output_info_iter = graph_output_info.begin();
-         output_info_iter != graph_output_info.end(); ++output_info_iter) {
-      auto output_names = output_info_iter->get_names();
-      std::string onnx_output_name;
-      std::string output_name;
-      bool output_name_found = false;
-      // using the output name retrieved from ONNX original to match with the output names returned by OV tensors
-      for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
-        onnx_output_name = it->first;
-        if (output_names.find(onnx_output_name) != output_names.end()) {
-          // Assigning the output_name
-          output_name = it->first;
-          output_name_found = true;
-          break;
-        }
-      }
-      if (!output_name_found) {
-        ORT_THROW(
-            log_tag +
-            "Output names mismatch between OpenVINO and ONNX. [ONNX Output: ] " +
-            onnx_output_name + " doesn't exist in the list of OpenVINO output tensor names");
-      }
-
-      size_t batch_size = 1;
-      Ort::UnownedValue tensor = GetOutputTensor(context,
-                                                 batch_size,
-                                                 infer_request,
-                                                 output_name,
-                                                 subgraph_context_.output_names);
-      auto mem_info = tensor.GetTensorMemoryInfo();
-      // Check if ORT Value wraps a device pointer
-      if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
-        const void* tensor_data = tensor.GetTensorRawData();
-        const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
-        // Create a shared Blob, set the Infer Request Output Blob
-        auto output = graph_output_info.at(0);
-        auto remote_tensor =
-            remote_context_->create_tensor(output.get_element_type(), output.get_shape(), *shared_buffer_const);
-        ov::Tensor tensor_t = static_cast<ov::Tensor>(remote_tensor);
-        OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_t);
-        try {
-          infer_request->SetTensor(output_name, tensor_ptr);
-        } catch (const char* msg) {
-          ORT_THROW(msg);
-        }
-      }
-    }
-
-    // Start Async inference
-    infer_request->StartAsync();
-  } catch (const char* msg) {
-    ORT_THROW(msg);
-  }
-}
-#endif
-
 // Wait for asynchronous inference completion on an Infer Request object indexed by infer_req_idx
 // and copy the results into a slice location within the batched output buffer indexed by batch_slice_idx
 void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
@@ -664,28 +535,11 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
     }
 
     LOGS_DEFAULT(INFO) << log_tag << "Get Idle Request";
-#ifdef IO_BUFFER_ENABLED
-    if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-        (session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) {
-      try {
-        StartRemoteAsyncInference(context, infer_request);
-      } catch (std::string const& msg) {
-        ORT_THROW(msg);
-      }
-    } else {
-      try {
-        StartAsyncInference(context, infer_request);
-      } catch (std::string const& msg) {
-        ORT_THROW(msg);
-      }
-    }
-#else
     try {
       StartAsyncInference(context, infer_request);
     } catch (const std::runtime_error& e) {
       ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what());
     }
-#endif
     try {
       CompleteAsyncInference(context, infer_request);
     } catch (const std::runtime_error& e) {
@@ -707,13 +561,11 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
     // Once the inference is completed, the infer_request becomes free and is placed back into pool of infer_requests_
     inferRequestsQueue_->putIdleRequest(std::move(infer_request));
 #ifndef NDEBUG
-#ifndef IO_BUFFER_ENABLED  // Printing performance counts is disabled when IO_BUFFER_ENABLED
     if (openvino_ep::backend_utils::IsDebugEnabled()) {
       inferRequestsQueue_->printstatus();  // Printing the elements of infer_requests_ vector pool only in debug mode
       std::string& hw_target = session_context_.device_type;
       printPerformanceCounts(std::move(infer_request_), std::cout, hw_target);
     }
-#endif
 #endif
   }
 }
diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h
index 49fbeeed3af27..697c088a80620 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.h
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -94,11 +94,6 @@ class BasicBackend : public IBackend {
   void EnableStreams();
   void SetNumThreads(ov::AnyMap& device_config);
   void StartAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);
-
-#ifdef IO_BUFFER_ENABLED
-  void StartRemoteAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);
-#endif
-
   void CompleteAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);
 
   SessionContext& session_context_;
@@ -108,10 +103,6 @@ class BasicBackend : public IBackend {
   OVExeNetwork exe_network_;
   std::map<std::string, std::shared_ptr<ov::Node>> const_outputs_map_;
   std::unique_ptr<InferRequestsQueue> inferRequestsQueue_;
-#if defined IO_BUFFER_ENABLED
-  OVRemoteContextPtr remote_context_;
-#endif
-
   using ort_tensor_key_t = const std::string;
   std::map ort_ov_tensor_map;
   std::unique_ptr bindings_;
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 87da0ade21551..0024a5e121bbf 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -143,38 +143,6 @@ void OVCore::SetCache(const std::string& cache_dir_path) {
   core.set_property(ov::cache_dir(cache_dir_path));
 }
-#ifdef IO_BUFFER_ENABLED
-OVExeNetwork OVCore::CompileModel(std::shared_ptr& model,
-                                  OVRemoteContextPtr context, std::string name) {
-  try {
-    auto obj = core.compile_model(model, *context);
-#ifndef NDEBUG
-    printDebugInfo(obj);
-#endif
-    return OVExeNetwork(obj);
-  } catch (const Exception& e) {
-    ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what());
-  } catch (...) {
-    ORT_THROW(log_tag + " Exception while Loading Network for graph " + name);
-  }
-}
-OVExeNetwork OVCore::ImportModel(std::shared_ptr model_stream,
-                                 OVRemoteContextPtr context, std::string name) {
-  try {
-    auto obj = core.import_model(*model_stream, *context);
-#ifndef NDEBUG
-    printDebugInfo(obj);
-#endif
-    OVExeNetwork exe(obj);
-    return exe;
-  } catch (const Exception& e) {
-    ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what());
-  } catch (...) {
-    ORT_THROW(log_tag + " Exception while Loading Network for graph " + name);
-  }
-}
-#endif
-
 std::vector<std::string> OVCore::GetAvailableDevices() const {
   std::vector<std::string> available_devices = core.get_available_devices();
   return available_devices;
 }
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index 079426e2d67fb..866f4a02f7780 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -14,11 +14,6 @@
 #include "openvino/runtime/intel_npu/properties.hpp"
 #include "openvino/pass/convert_fp32_to_fp16.hpp"
 #include "openvino/frontend/manager.hpp"
-
-#ifdef IO_BUFFER_ENABLED
-#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
-#endif
-
 #include
 
 namespace onnxruntime {
@@ -32,12 +27,6 @@ typedef ov::ProfilingInfo OVProfilingInfo;
 typedef ov::Model OVNetwork;
 typedef std::shared_ptr<OVInferRequest> OVInferRequestPtr;
 typedef std::shared_ptr<ov::Tensor> OVTensorPtr;
-
-#ifdef IO_BUFFER_ENABLED
-typedef ov::intel_gpu::ocl::ClContext* OVRemoteContextPtr;
-typedef ov::RemoteContext OVRemoteContext;
-#endif
-
 std::optional queryOVProperty(const std::string& property, const std::string& device_type);
 
 template
@@ -87,14 +76,6 @@ struct OVCore : WeakSingleton<OVCore> {
                             std::string hw_target,
                             const ov::AnyMap& device_config,
                             std::string name);
-#ifdef IO_BUFFER_ENABLED
-  OVExeNetwork CompileModel(std::shared_ptr& model,
-                            OVRemoteContextPtr context,
-                            std::string name);
-  OVExeNetwork ImportModel(std::shared_ptr model_stream,
-                           OVRemoteContextPtr context,
-                           std::string name);
-#endif
   std::vector<std::string> GetAvailableDevices() const;
   std::vector<std::string> GetAvailableDevices(const std::string& device_type) const;
   void SetCache(const std::string& cache_dir_path);
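The performance_runner.cc hunk in patch 2 moves the final counter wait onto the thread pool so a failed iteration cannot leave the harness blocked. Below is a self-contained sketch of the same simplified Eigen::Barrier idea with the join kept on the submitting thread and the decrement made unconditional, so the wait can neither hang nor outlive the locals it captures by reference; plain std::thread stands in for onnxruntime's thread pool and run_one_iteration for RunOneIteration(), which makes this an illustrative alternative under those assumptions rather than the patch's code.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <thread>
#include <vector>

// Sketch of the simplified Eigen::Barrier pattern in RunParallelDuration():
// every iteration decrements the counter on success and on failure, and the
// submitting thread waits in place, so counter, m and cv stay alive for the
// workers. std::thread stands in for the onnxruntime thread pool here.
int main() {
  std::mutex m;
  std::condition_variable cv;
  int counter = 0;

  auto run_one_iteration = [](int i) {
    if (i == 2) throw std::runtime_error("inference failed");  // simulate an EP error
  };

  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    {
      std::lock_guard<std::mutex> lg(m);
      counter++;
    }
    workers.emplace_back([&, i]() {
      try {
        run_one_iteration(i);
      } catch (const std::exception& e) {
        std::cerr << "iteration " << i << ": " << e.what() << "\n";
      }
      // Decrement unconditionally so the join below cannot hang.
      std::lock_guard<std::mutex> lg(m);
      counter--;
      cv.notify_all();
    });
  }

  // Join on the submitting thread.
  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [&counter]() { return counter == 0; });
  lock.unlock();

  for (auto& t : workers) {
    t.join();
  }
  std::cout << "all iterations accounted for" << std::endl;
  return 0;
}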