From 85a77a70a44e24363f9eeb34d890ddde3c5651b3 Mon Sep 17 00:00:00 2001
From: Preetha Veeramalai
Date: Mon, 2 Jun 2025 08:08:15 -0700
Subject: [PATCH] Remove stale IO buffer code

Drop the stale IO_BUFFER_ENABLED path (OpenCL remote-tensor I/O for the GPU
device) and its remote-context plumbing from the CMake configuration, the
basic backend, the provider factory, and the OV interface.

---
 cmake/onnxruntime_providers_openvino.cmake   |   1 -
 .../openvino/backends/basic_backend.cc       | 150 +-----------------
 .../openvino/backends/basic_backend.h        |   7 -
 .../openvino/openvino_provider_factory.cc    |   8 -
 .../core/providers/openvino/ov_interface.cc  |  32 ----
 .../core/providers/openvino/ov_interface.h   |  17 --
 6 files changed, 1 insertion(+), 214 deletions(-)

diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake
index 03f67983c70ab..9e272bd18a497 100644
--- a/cmake/onnxruntime_providers_openvino.cmake
+++ b/cmake/onnxruntime_providers_openvino.cmake
@@ -31,7 +31,6 @@
   list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})

   if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}) AND onnxruntime_USE_OPENVINO_GPU)
-    add_definitions(-DIO_BUFFER_ENABLED=1)
     list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS})
   endif()

diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
index e77ff973f3a87..e050967e4bfb7 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -60,27 +60,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr
   }

   try {
-    // IO_BUFFER is enabled on GPU HW.
-    // Pre-requisite is provider_option "context" must be set
-#if defined(IO_BUFFER_ENABLED)
-    cl_context ctx = static_cast<cl_context>(session_context_.context);
-    remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get()->core, ctx);
-    if (subgraph_context_.is_ep_ctx_graph) {
-      exe_network_ = OVCore::Get()->ImportModel(*model_stream,
-                                                remote_context_,
-                                                subgraph_context_.subgraph_name);
-      model_stream.reset();  // Delete stream after it is no longer needed
-    } else {
-      std::string model = model_proto->SerializeAsString();
-      if (!subgraph_context.has_dynamic_input_shape) {
-        model_proto.reset()
-      }
-      auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
-      LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled";
-      exe_network_ = OVCore::Get()->CompileModel(
-          ov_model, remote_context_, subgraph_context_.subgraph_name);
-    }
-#else  // !IO_BUFFER_ENABLED
+
     auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
                                  (session_context_.OpenVINO_Version.at(0) >= 2024 &&
                                   session_context_.OpenVINO_Version.at(1) > 2));
@@ -117,7 +97,6 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr
       exe_network_ = OVCore::Get()->CompileModel(
           ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
     }
-#endif
     LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
   } catch (const char* msg) {
     ORT_THROW(msg);
   }
@@ -459,114 +438,6 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
   }
 }

-#ifdef IO_BUFFER_ENABLED
-// Wait for Remote Aynchronous inference completion
-void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
-  try {
-    auto graph_input_info = exe_network_.Get().inputs();
-    int input_idx = 0;
-    for (auto input_info_iter = graph_input_info.begin();
-         input_info_iter != graph_input_info.end(); ++input_info_iter) {
-      auto input_names = input_info_iter->get_names();
-      std::string onnx_input_name;
-      std::string input_name;
-      // use names retrieved from original ONNX model to assign the right onnx input name for the graph
-      for (auto it = subgraph_context_.input_names.begin(); it != subgraph_context_.input_names.end(); ++it) {
-        if (it->second == input_idx) {
-          onnx_input_name = it->first;
-          break;
-        }
-      }
-      // using the input name retrieved from ONNX original to match with the input names returned by OV tensors
-      if (input_names.find(onnx_input_name) != input_names.end()) {
-        input_name = onnx_input_name;
-      } else {
-        ORT_THROW(log_tag +
-                  "Input names mismatch between OpenVINO and ONNX. " +
-                  onnx_input_name +
-                  " doesn't exist in the list of OpenVINO input tensor names");
-      }
-      input_idx++;
-      // Kernel Context Input Buffer
-      const auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
-      // If the ORTValue wraps a device pointer
-      auto mem_info = tensor.GetTensorMemoryInfo();
-      if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
-        // Get the shared buffer pointer
-        const void* tensor_data = tensor.GetTensorRawData();
-        const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
-        // Create an Input Remote Blob
-        auto input = graph_input_info.at(0);
-        auto remote_blob = remote_context_->create_tensor(
-            input.get_element_type(), input.get_shape(), *shared_buffer_const);
-        ov::Tensor tensor_remote = static_cast<ov::Tensor>(remote_blob);
-        OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_remote);
-        infer_request->SetTensor(input_name, tensor_ptr);
-      } else {
-        OVTensorPtr graph_input_blob;
-        graph_input_blob = infer_request->GetTensor(input_name);
-        size_t batch_slice_idx = 0;
-        FillInputBlob(graph_input_blob, batch_slice_idx, input_name, context, subgraph_context_);
-      }
-    }
-
-    // Set the output blob as remote blob
-    auto graph_output_info = exe_network_.Get().outputs();
-    for (auto output_info_iter = graph_output_info.begin();
-         output_info_iter != graph_output_info.end(); ++output_info_iter) {
-      auto output_names = output_info_iter->get_names();
-      std::string onnx_output_name;
-      std::string output_name;
-      bool output_name_found = false;
-      // using the output name retrieved from ONNX original to match with the output names returned by OV tensors
-      for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
-        onnx_output_name = it->first;
-        if (output_names.find(onnx_output_name) != output_names.end()) {
-          // Assigning the output_name
-          output_name = it->first;
-          output_name_found = true;
-          break;
-        }
-      }
-      if (!output_name_found) {
-        ORT_THROW(
-            log_tag +
-            "Output names mismatch between OpenVINO and ONNX. [ONNX Output: ] " +
-            onnx_output_name + " doesn't exist in the list of OpenVINO output tensor names");
-      }
-
-      size_t batch_size = 1;
-      Ort::UnownedValue tensor = GetOutputTensor(context,
-                                                 batch_size,
-                                                 infer_request,
-                                                 output_name,
-                                                 subgraph_context_.output_names);
-      auto mem_info = tensor.GetTensorMemoryInfo();
-      // Check if ORT Value wraps a device pointer
-      if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
-        const void* tensor_data = tensor.GetTensorRawData();
-        const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
-        // Create a shared Blob, set the Infer Request Output Blob
-        auto output = graph_output_info.at(0);
-        auto remote_tensor =
-            remote_context_->create_tensor(output.get_element_type(), output.get_shape(), *shared_buffer_const);
-        ov::Tensor tensor_t = static_cast<ov::Tensor>(remote_tensor);
-        OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_t);
-        try {
-          infer_request->SetTensor(output_name, tensor_ptr);
-        } catch (const char* msg) {
-          ORT_THROW(msg);
-        }
-      }
-    }
-
-    // Start Async inference
-    infer_request->StartAsync();
-  } catch (const char* msg) {
-    ORT_THROW(msg);
-  }
-}
-#endif

 // Wait for asynchronous inference completion on an Infer Request object indexed by infer_req_idx
 // and copy the results into a slice location within the batched output buffer indexed by batch_slice_idx
@@ -653,28 +524,11 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
     // Requesting for an idle infer_request from a pool of infer_requests_
     OVInferRequestPtr infer_request;
     infer_request = inferRequestsQueue_->getIdleRequest();
-#ifdef IO_BUFFER_ENABLED
-    if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-        (session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) {
-      try {
-        StartRemoteAsyncInference(context, infer_request);
-      } catch (std::string const& msg) {
-        ORT_THROW(msg);
-      }
-    } else {
-      try {
-        StartAsyncInference(context, infer_request);
-      } catch (std::string const& msg) {
-        ORT_THROW(msg);
-      }
-    }
-#else
     try {
       StartAsyncInference(context, infer_request);
     } catch (const std::runtime_error& e) {
       ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what());
     }
-#endif
     try {
       CompleteAsyncInference(context, infer_request);
     } catch (const std::runtime_error& e) {
@@ -696,13 +550,11 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
     // Once the inference is completed, the infer_request becomes free and is placed back into pool of infer_requests_
     inferRequestsQueue_->putIdleRequest(std::move(infer_request));
 #ifndef NDEBUG
-#ifndef IO_BUFFER_ENABLED  // Printing performance counts is disabled when IO_BUFFER_ENABLED
     if (openvino_ep::backend_utils::IsDebugEnabled()) {
       inferRequestsQueue_->printstatus();  // Printing the elements of infer_requests_ vector pool only in debug mode
       std::string& hw_target = session_context_.device_type;
       printPerformanceCounts(std::move(infer_request_), std::cout, hw_target);
     }
-#endif
 #endif
   }
 }

diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h
index 130699abd465b..a19b49e48e513 100644
--- a/onnxruntime/core/providers/openvino/backends/basic_backend.h
+++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -95,10 +95,6 @@ class BasicBackend : public IBackend {
   void SetNumThreads(ov::AnyMap& device_config);
   void StartAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);

-#ifdef IO_BUFFER_ENABLED
-  void StartRemoteAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);
-#endif
-
   void CompleteAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);

   SessionContext& session_context_;
@@ -108,9 +104,6 @@ class BasicBackend : public IBackend {
   OVExeNetwork exe_network_;
   std::map<std::string, std::shared_ptr<ov::Node>> const_outputs_map_;
   std::unique_ptr<InferRequestsQueue> inferRequestsQueue_;
-#if defined IO_BUFFER_ENABLED
-  OVRemoteContextPtr remote_context_;
-#endif
   using ort_tensor_key_t = const std::string;
   std::map ort_ov_tensor_map;

diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc
index e5526ecd52bb9..0545f6f1a55ec 100644
--- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc
+++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc
@@ -296,14 +296,6 @@ static void ParseProviderInfo(const ProviderOptions& provider_options,
   }
   pi.context = ParseUint64(provider_options, "context");
-#if defined(IO_BUFFER_ENABLED)
-  // a valid context must be provided to enable IO Buffer optimizations
-  if (pi.context == nullptr) {
-#undef IO_BUFFER_ENABLED
-#define IO_BUFFER_ENABLED = 0
-    LOGS_DEFAULT(WARNING) << "Context is not set. Disabling IO Buffer optimization";
-  }
-#endif

   if (provider_options.contains("num_of_threads")) {
     if (!std::all_of(provider_options.at("num_of_threads").begin(),
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index a175ca863d1d1..9b9ce366f7ec9 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -143,38 +143,6 @@ void OVCore::SetCache(const std::string& cache_dir_path) {
   core.set_property(ov::cache_dir(cache_dir_path));
 }

-#ifdef IO_BUFFER_ENABLED
-OVExeNetwork OVCore::CompileModel(std::shared_ptr& model,
-                                  OVRemoteContextPtr context, std::string name) {
-  try {
-    auto obj = core.compile_model(model, *context);
-#ifndef NDEBUG
-    printDebugInfo(obj);
-#endif
-    return OVExeNetwork(obj);
-  } catch (const Exception& e) {
-    ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what());
-  } catch (...) {
-    ORT_THROW(log_tag + " Exception while Loading Network for graph " + name);
-  }
-}
-OVExeNetwork OVCore::ImportModel(std::shared_ptr model_stream,
-                                 OVRemoteContextPtr context, std::string name) {
-  try {
-    auto obj = core.import_model(*model_stream, *context);
-#ifndef NDEBUG
-    printDebugInfo(obj);
-#endif
-    OVExeNetwork exe(obj);
-    return exe;
-  } catch (const Exception& e) {
-    ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what());
-  } catch (...) {
-    ORT_THROW(log_tag + " Exception while Loading Network for graph " + name);
-  }
-}
-#endif
-
 std::vector<std::string> OVCore::GetAvailableDevices() const {
   std::vector<std::string> available_devices = core.get_available_devices();
   return available_devices;
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index bebe73bd702dd..d47ffd5c3c561 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -15,10 +15,6 @@
 #include "openvino/pass/convert_fp32_to_fp16.hpp"
 #include "openvino/frontend/manager.hpp"

-#ifdef IO_BUFFER_ENABLED
-#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
-#endif
-
 #include

 namespace onnxruntime {
@@ -33,11 +29,6 @@
 typedef ov::Model OVNetwork;
 typedef std::shared_ptr<OVInferRequest> OVInferRequestPtr;
 typedef std::shared_ptr<ov::Tensor> OVTensorPtr;

-#ifdef IO_BUFFER_ENABLED
-typedef ov::intel_gpu::ocl::ClContext* OVRemoteContextPtr;
-typedef ov::RemoteContext OVRemoteContext;
-#endif
-
 std::optional<bool> queryOVProperty(const std::string& property, const std::string& device_type);

 template
@@ -87,14 +78,6 @@ struct OVCore : WeakSingleton<OVCore> {
                             std::string hw_target,
                             const ov::AnyMap& device_config,
                             std::string name);
-#ifdef IO_BUFFER_ENABLED
-  OVExeNetwork CompileModel(std::shared_ptr& model,
-                            OVRemoteContextPtr context,
-                            std::string name);
-  OVExeNetwork ImportModel(std::shared_ptr model_stream,
-                           OVRemoteContextPtr context,
-                           std::string name);
-#endif
   std::vector<std::string> GetAvailableDevices() const;
   std::vector<std::string> GetAvailableDevices(const std::string& device_type) const;
   void SetCache(const std::string& cache_dir_path);