5 changes: 0 additions & 5 deletions cmake/onnxruntime_providers_openvino.cmake
@@ -30,11 +30,6 @@
endif()

list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})
if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}) AND onnxruntime_USE_OPENVINO_GPU)
add_definitions(-DIO_BUFFER_ENABLED=1)
list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS})
endif()

source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs})
onnxruntime_add_shared_library_module(onnxruntime_providers_openvino ${onnxruntime_providers_openvino_cc_srcs} "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc")

211 changes: 37 additions & 174 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -62,25 +62,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
try {
// IO_BUFFER is enabled on GPU HW.
// Pre-requisite is provider_option "context" must be set
#if defined(IO_BUFFER_ENABLED)
cl_context ctx = static_cast<cl_context>(session_context_.context);
remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get()->core, ctx);
if (subgraph_context_.is_ep_ctx_graph) {
exe_network_ = OVCore::Get()->ImportModel(*model_stream,
remote_context_,
subgraph_context_.subgraph_name);
model_stream.reset(); // Delete stream after it is no longer needed
} else {
std::string model = model_proto->SerializeAsString();
if (!subgraph_context.has_dynamic_input_shape) {
model_proto.reset();
}
auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled";
exe_network_ = OVCore::Get()->CompileModel(
ov_model, remote_context_, subgraph_context_.subgraph_name);
}
#else // !IO_BUFFER_ENABLED
auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
(session_context_.OpenVINO_Version.at(0) >= 2024 &&
session_context_.OpenVINO_Version.at(1) > 2));
@@ -117,7 +98,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
exe_network_ = OVCore::Get()->CompileModel(
ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
}
#endif
LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
} catch (const char* msg) {
ORT_THROW(msg);
@@ -459,150 +439,46 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
}

#ifdef IO_BUFFER_ENABLED

@sfatimar: Why is IO Buffer Optimization removed?

// Wait for Remote Asynchronous inference completion
void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
try {
auto graph_input_info = exe_network_.Get().inputs();
int input_idx = 0;
for (auto input_info_iter = graph_input_info.begin();
input_info_iter != graph_input_info.end(); ++input_info_iter) {
auto input_names = input_info_iter->get_names();
std::string onnx_input_name;
std::string input_name;
// use names retrieved from original ONNX model to assign the right onnx input name for the graph
for (auto it = subgraph_context_.input_names.begin(); it != subgraph_context_.input_names.end(); ++it) {
if (it->second == input_idx) {
onnx_input_name = it->first;
break;
}
}
// using the input name retrieved from ONNX original to match with the input names returned by OV tensors
if (input_names.find(onnx_input_name) != input_names.end()) {
input_name = onnx_input_name;
} else {
ORT_THROW(log_tag +
"Input names mismatch between OpenVINO and ONNX. " +
onnx_input_name +
" doesn't exist in the list of OpenVINO input tensor names");
}
input_idx++;
// Kernel Context Input Buffer
const auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
// If the ORTValue wraps a device pointer
auto mem_info = tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
// Get the shared buffer pointer
const void* tensor_data = tensor.GetTensorRawData();
const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
// Create an Input Remote Blob
auto input = graph_input_info.at(0);
auto remote_blob = remote_context_->create_tensor(
input.get_element_type(), input.get_shape(), *shared_buffer_const);
ov::Tensor tensor_remote = static_cast<ov::Tensor>(remote_blob);
OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_remote);
infer_request->SetTensor(input_name, tensor_ptr);
} else {
OVTensorPtr graph_input_blob;
graph_input_blob = infer_request->GetTensor(input_name);
size_t batch_slice_idx = 0;
FillInputBlob(graph_input_blob, batch_slice_idx, input_name, context, subgraph_context_);
}
}

// Set the output blob as remote blob
auto graph_output_info = exe_network_.Get().outputs();
for (auto output_info_iter = graph_output_info.begin();
output_info_iter != graph_output_info.end(); ++output_info_iter) {
auto output_names = output_info_iter->get_names();
std::string onnx_output_name;
std::string output_name;
bool output_name_found = false;
// using the output name retrieved from ONNX original to match with the output names returned by OV tensors
for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
onnx_output_name = it->first;
if (output_names.find(onnx_output_name) != output_names.end()) {
// Assigning the output_name
output_name = it->first;
output_name_found = true;
break;
}
}
if (!output_name_found) {
ORT_THROW(
log_tag +
"Output names mismatch between OpenVINO and ONNX. [ONNX Output: ] " +
onnx_output_name + " doesn't exist in the list of OpenVINO output tensor names");
}

size_t batch_size = 1;
Ort::UnownedValue tensor = GetOutputTensor(context,
batch_size,
infer_request,
output_name,
subgraph_context_.output_names);
auto mem_info = tensor.GetTensorMemoryInfo();
// Check if ORT Value wraps a device pointer
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
const void* tensor_data = tensor.GetTensorRawData();
const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
// Create a shared Blob, set the Infer Request Output Blob
auto output = graph_output_info.at(0);
auto remote_tensor =
remote_context_->create_tensor(output.get_element_type(), output.get_shape(), *shared_buffer_const);
ov::Tensor tensor_t = static_cast<ov::Tensor>(remote_tensor);
OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_t);
try {
infer_request->SetTensor(output_name, tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
}
}

// Start Async inference
infer_request->StartAsync();
} catch (const char* msg) {
ORT_THROW(msg);
}
}
#endif

// Wait for asynchronous inference completion on an Infer Request object indexed by infer_req_idx
// and copy the results into a slice location within the batched output buffer indexed by batch_slice_idx
void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
// Wait for Async inference completion
try {
bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;

infer_request->WaitRequest();
} catch(const std::runtime_error& e) {
infer_request->CancelRequest();
inferRequestsQueue_->deleteRequest();
ORT_THROW(log_tag + e.what());
}

if (cpu_or_gpu) {
for (const auto& output_info : bindings_->network_outputs_) {
OVTensorPtr graph_output_blob;
try {
graph_output_blob = infer_request->GetTensor(output_info.name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;
if (cpu_or_gpu) {
for (const auto& output_info : bindings_->network_outputs_) {
OVTensorPtr graph_output_blob;
try {
graph_output_blob = infer_request->GetTensor(output_info.name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
} else {
size_t batch_slice = 0;
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
}
} else {
size_t batch_slice = 0;
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
}
}
}

if (!const_outputs_map_.empty()) {
for (const auto& item : const_outputs_map_) {
const auto& out_name = item.first;
auto node = item.second;
if (!const_outputs_map_.empty()) {
for (const auto& item : const_outputs_map_) {
const auto& out_name = item.first;
auto node = item.second;
try {
Ort::UnownedValue output_tensor = GetOutputTensor(context,
out_name,
subgraph_context_.output_names,
@@ -613,10 +489,10 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
} else {
FillOutputsWithConstantData(std::move(node), output_tensor);
}
} catch (std::string const& msg) {
ORT_THROW(msg);
}
}
} catch (const char* msg) {
ORT_THROW(msg);
}
}

@@ -650,31 +526,20 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
}

} else {
// Requesting for an idle infer_request from a pool of infer_requests_
OVInferRequestPtr infer_request;
infer_request = inferRequestsQueue_->getIdleRequest();
#ifdef IO_BUFFER_ENABLED
if ((session_context_.device_type.find("GPU") != std::string::npos) &&
(session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) {
try {
StartRemoteAsyncInference(context, infer_request);
} catch (std::string const& msg) {
ORT_THROW(msg);
}
} else {
try {
StartAsyncInference(context, infer_request);
} catch (std::string const& msg) {
ORT_THROW(msg);
}
if(infer_request == nullptr) {
ORT_THROW("OpenVINO Execution Provider :: There are no inference requests");
LOGS_DEFAULT(FATAL) << log_tag << "Create Infer Requests do not exist";
return;
}
#else

LOGS_DEFAULT(INFO) << log_tag << "Get Idle Request";
try {
StartAsyncInference(context, infer_request);
} catch (const std::runtime_error& e) {
ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what());
}
#endif
try {
CompleteAsyncInference(context, infer_request);
} catch (const std::runtime_error& e) {
@@ -696,13 +561,11 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
// Once the inference is completed, the infer_request becomes free and is placed back into pool of infer_requests_
inferRequestsQueue_->putIdleRequest(std::move(infer_request));
#ifndef NDEBUG
#ifndef IO_BUFFER_ENABLED // Printing performance counts is disabled when IO_BUFFER_ENABLED
if (openvino_ep::backend_utils::IsDebugEnabled()) {
inferRequestsQueue_->printstatus(); // Printing the elements of infer_requests_ vector pool only in debug mode
std::string& hw_target = session_context_.device_type;
printPerformanceCounts(std::move(infer_request_), std::cout, hw_target);
}
#endif
#endif
}
}
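Note for readers following the review question above about why the IO Buffer optimization was removed: with the `IO_BUFFER_ENABLED` branch gone, only the standard compile-and-infer path remains, where tensor data moves through host-visible memory instead of being wrapped around a shared `cl::Buffer` via a GPU remote context. A minimal, generic OpenVINO 2.x sketch of that remaining flow is below; it is a standalone illustration, not ONNX Runtime code, and the model path and device string are placeholders.

```cpp
// Generic OpenVINO 2.x sketch (not ORT code) of the plain compile/infer path
// that remains once the IO_BUFFER_ENABLED branch is removed: the model is
// compiled against a device string and tensors travel through host memory.
#include <openvino/openvino.hpp>

#include <cstring>

int main() {
  ov::Core core;
  // "model.onnx" and "GPU" are placeholders for the real model and device.
  std::shared_ptr<ov::Model> model = core.read_model("model.onnx");
  ov::CompiledModel compiled = core.compile_model(model, "GPU");

  ov::InferRequest request = compiled.create_infer_request();

  // Host-owned input tensor; the plugin copies it to device memory as needed.
  ov::Tensor input = request.get_input_tensor(0);
  std::memset(input.data(), 0, input.get_byte_size());

  request.start_async();
  request.wait();

  // The output also comes back through a host-visible tensor.
  ov::Tensor output = request.get_output_tensor(0);
  return output.get_size() > 0 ? 0 : 1;
}
```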
22 changes: 13 additions & 9 deletions onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -94,11 +94,6 @@ class BasicBackend : public IBackend {
void EnableStreams();
void SetNumThreads(ov::AnyMap& device_config);
void StartAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);

#ifdef IO_BUFFER_ENABLED
void StartRemoteAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);
#endif

void CompleteAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);

SessionContext& session_context_;
@@ -108,10 +103,6 @@
OVExeNetwork exe_network_;
std::map<std::string, std::shared_ptr<ov::Node>> const_outputs_map_;
std::unique_ptr<InferRequestsQueue> inferRequestsQueue_;
#if defined IO_BUFFER_ENABLED
OVRemoteContextPtr remote_context_;
#endif

using ort_tensor_key_t = const std::string;
std::map<ort_tensor_key_t, ov_tensor_data_t> ort_ov_tensor_map;
std::unique_ptr<OnnxToOvNetworkBindings> bindings_;
@@ -121,6 +112,7 @@ class InferRequestsQueue {
public:
InferRequestsQueue(OVExeNetwork& net, size_t nireq, std::function<void(OVInferRequestPtr)> initializer) {
OVInferRequestPtr infer_request;
live_threads=nireq;
for (size_t id = 0; id < nireq; id++) {
infer_request = std::make_shared<OVInferRequest>(net.CreateInferRequest());
initializer(infer_request);
@@ -152,16 +144,28 @@

OVInferRequestPtr getIdleRequest() {
std::unique_lock<std::mutex> lock(_mutex);
std::cout << "get Idle Request" << live_threads << "\n";
if(live_threads==0) {
return nullptr;
}

_cv.wait(lock, [this] { return infer_requests_.size() > 0; });
auto request = infer_requests_.at(0);
infer_requests_.erase(infer_requests_.begin());
return request;
}

void deleteRequest() {
std::unique_lock<std::mutex> lock(_mutex);
live_threads=live_threads-1;
std::cout << "delete Request" << live_threads << "\n";
}

private:
std::mutex _mutex;
std::condition_variable _cv;
std::vector<OVInferRequestPtr> infer_requests_;
int live_threads;
};

} // namespace openvino_ep
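The `InferRequestsQueue` changes above add a `live_threads` counter plus a `deleteRequest()` hook so the pool can shrink when `CompleteAsyncInference` cancels a failed request, and `getIdleRequest()` can report exhaustion by returning `nullptr`. A simplified, standalone sketch of that pool contract follows; the class and member names are invented rather than taken from `basic_backend.h`, and notifying waiters on deletion is an assumption about the intended behaviour, not a copy of the diff.

```cpp
// Illustrative request pool (invented names, not the ORT classes) whose
// capacity can shrink when an in-flight request is cancelled.
#include <condition_variable>
#include <cstddef>
#include <iostream>
#include <memory>
#include <mutex>
#include <vector>

struct Request { int id; };
using RequestPtr = std::shared_ptr<Request>;

class RequestPool {
 public:
  explicit RequestPool(std::size_t n) : live_(n) {
    for (std::size_t i = 0; i < n; ++i)
      idle_.push_back(std::make_shared<Request>(Request{static_cast<int>(i)}));
  }

  // Blocks until a request is idle; returns nullptr once the pool is empty.
  RequestPtr GetIdle() {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return !idle_.empty() || live_ == 0; });
    if (live_ == 0) return nullptr;
    RequestPtr r = idle_.front();
    idle_.erase(idle_.begin());
    return r;
  }

  // Return a request to the pool after a successful inference.
  void PutIdle(RequestPtr r) {
    { std::lock_guard<std::mutex> lock(mutex_); idle_.push_back(std::move(r)); }
    cv_.notify_one();
  }

  // Drop a request permanently (mirrors CancelRequest() + deleteRequest()).
  void DeleteRequest() {
    { std::lock_guard<std::mutex> lock(mutex_); --live_; }
    cv_.notify_all();  // assumption: wake waiters so they can observe live_ == 0
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  std::vector<RequestPtr> idle_;
  std::size_t live_;
};

int main() {
  RequestPool pool(1);
  RequestPtr r = pool.GetIdle();
  bool inference_failed = true;  // pretend WaitRequest() threw
  if (inference_failed) {
    pool.DeleteRequest();
  } else {
    pool.PutIdle(std::move(r));
  }
  std::cout << (pool.GetIdle() == nullptr ? "pool exhausted\n" : "got request\n");
  return 0;
}
```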