5 changes: 0 additions & 5 deletions cmake/onnxruntime_providers_openvino.cmake
@@ -30,11 +30,6 @@
endif()

list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})
if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}) AND onnxruntime_USE_OPENVINO_GPU)
add_definitions(-DIO_BUFFER_ENABLED=1)
list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS})
endif()

source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs})
onnxruntime_add_shared_library_module(onnxruntime_providers_openvino ${onnxruntime_providers_openvino_cc_srcs} "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc")

211 changes: 37 additions & 174 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -62,25 +62,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
try {
// IO_BUFFER is enabled on GPU HW.
// Pre-requisite is provider_option "context" must be set
#if defined(IO_BUFFER_ENABLED)
cl_context ctx = static_cast<cl_context>(session_context_.context);
remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get()->core, ctx);
if (subgraph_context_.is_ep_ctx_graph) {
exe_network_ = OVCore::Get()->ImportModel(*model_stream,
remote_context_,
subgraph_context_.subgraph_name);
model_stream.reset(); // Delete stream after it is no longer needed
} else {
std::string model = model_proto->SerializeAsString();
if (!subgraph_context.has_dynamic_input_shape) {
model_proto.reset();
}
auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled";
exe_network_ = OVCore::Get()->CompileModel(
ov_model, remote_context_, subgraph_context_.subgraph_name);
}
#else // !IO_BUFFER_ENABLED
auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
(session_context_.OpenVINO_Version.at(0) >= 2024 &&
session_context_.OpenVINO_Version.at(1) > 2));
@@ -117,7 +98,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
exe_network_ = OVCore::Get()->CompileModel(
ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
}
#endif
LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
} catch (const char* msg) {
ORT_THROW(msg);
@@ -459,150 +439,46 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
}

#ifdef IO_BUFFER_ENABLED

@sfatimar: Why is IO Buffer Optimization removed?

// Wait for Remote Asynchronous inference completion
void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
try {
auto graph_input_info = exe_network_.Get().inputs();
int input_idx = 0;
for (auto input_info_iter = graph_input_info.begin();
input_info_iter != graph_input_info.end(); ++input_info_iter) {
auto input_names = input_info_iter->get_names();
std::string onnx_input_name;
std::string input_name;
// use names retrieved from original ONNX model to assign the right onnx input name for the graph
for (auto it = subgraph_context_.input_names.begin(); it != subgraph_context_.input_names.end(); ++it) {
if (it->second == input_idx) {
onnx_input_name = it->first;
break;
}
}
// using the input name retrieved from ONNX original to match with the input names returned by OV tensors
if (input_names.find(onnx_input_name) != input_names.end()) {
input_name = onnx_input_name;
} else {
ORT_THROW(log_tag +
"Input names mismatch between OpenVINO and ONNX. " +
onnx_input_name +
" doesn't exist in the list of OpenVINO input tensor names");
}
input_idx++;
// Kernel Context Input Buffer
const auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
// If the ORTValue wraps a device pointer
auto mem_info = tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
// Get the shared buffer pointer
const void* tensor_data = tensor.GetTensorRawData();
const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
// Create an Input Remote Blob
auto input = graph_input_info.at(0);
auto remote_blob = remote_context_->create_tensor(
input.get_element_type(), input.get_shape(), *shared_buffer_const);
ov::Tensor tensor_remote = static_cast<ov::Tensor>(remote_blob);
OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_remote);
infer_request->SetTensor(input_name, tensor_ptr);
} else {
OVTensorPtr graph_input_blob;
graph_input_blob = infer_request->GetTensor(input_name);
size_t batch_slice_idx = 0;
FillInputBlob(graph_input_blob, batch_slice_idx, input_name, context, subgraph_context_);
}
}

// Set the output blob as remote blob
auto graph_output_info = exe_network_.Get().outputs();
for (auto output_info_iter = graph_output_info.begin();
output_info_iter != graph_output_info.end(); ++output_info_iter) {
auto output_names = output_info_iter->get_names();
std::string onnx_output_name;
std::string output_name;
bool output_name_found = false;
// using the output name retrieved from ONNX original to match with the output names returned by OV tensors
for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
onnx_output_name = it->first;
if (output_names.find(onnx_output_name) != output_names.end()) {
// Assigning the output_name
output_name = it->first;
output_name_found = true;
break;
}
}
if (!output_name_found) {
ORT_THROW(
log_tag +
"Output names mismatch between OpenVINO and ONNX. [ONNX Output: ] " +
onnx_output_name + " doesn't exist in the list of OpenVINO output tensor names");
}

size_t batch_size = 1;
Ort::UnownedValue tensor = GetOutputTensor(context,
batch_size,
infer_request,
output_name,
subgraph_context_.output_names);
auto mem_info = tensor.GetTensorMemoryInfo();
// Check if ORT Value wraps a device pointer
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
const void* tensor_data = tensor.GetTensorRawData();
const cl::Buffer* shared_buffer_const = static_cast<const cl::Buffer*>(tensor_data);
// Create a shared Blob, set the Infer Request Output Blob
auto output = graph_output_info.at(0);
auto remote_tensor =
remote_context_->create_tensor(output.get_element_type(), output.get_shape(), *shared_buffer_const);
ov::Tensor tensor_t = static_cast<ov::Tensor>(remote_tensor);
OVTensorPtr tensor_ptr = std::make_shared<ov::Tensor>(tensor_t);
try {
infer_request->SetTensor(output_name, tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
}
}

// Start Async inference
infer_request->StartAsync();
} catch (const char* msg) {
ORT_THROW(msg);
}
}
#endif

// Wait for asynchronous inference completion on an Infer Request object indexed by infer_req_idx
// and copy the results into a slice location within the batched output buffer indexed by batch_slice_idx
void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
// Wait for Async inference completion
try {
bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;

infer_request->WaitRequest();
} catch(const std::runtime_error& e) {
infer_request->CancelRequest();
inferRequestsQueue_->deleteRequest();
ORT_THROW(log_tag + e.what());
}

if (cpu_or_gpu) {
for (const auto& output_info : bindings_->network_outputs_) {
OVTensorPtr graph_output_blob;
try {
graph_output_blob = infer_request->GetTensor(output_info.name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;
if (cpu_or_gpu) {
for (const auto& output_info : bindings_->network_outputs_) {
OVTensorPtr graph_output_blob;
try {
graph_output_blob = infer_request->GetTensor(output_info.name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
} else {
size_t batch_slice = 0;
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
}
} else {
size_t batch_slice = 0;
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
}
}
}

if (!const_outputs_map_.empty()) {
for (const auto& item : const_outputs_map_) {
const auto& out_name = item.first;
auto node = item.second;
if (!const_outputs_map_.empty()) {
for (const auto& item : const_outputs_map_) {
const auto& out_name = item.first;
auto node = item.second;
try {
Ort::UnownedValue output_tensor = GetOutputTensor(context,
out_name,
subgraph_context_.output_names,
@@ -613,10 +489,10 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
} else {
FillOutputsWithConstantData(std::move(node), output_tensor);
}
} catch (std::string const& msg) {
ORT_THROW(msg);
}
}
} catch (const char* msg) {
ORT_THROW(msg);
}
}

@@ -650,31 +526,20 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
}

} else {
// Requesting for an idle infer_request from a pool of infer_requests_
OVInferRequestPtr infer_request;
infer_request = inferRequestsQueue_->getIdleRequest();
#ifdef IO_BUFFER_ENABLED
if ((session_context_.device_type.find("GPU") != std::string::npos) &&
(session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) {
try {
StartRemoteAsyncInference(context, infer_request);
} catch (std::string const& msg) {
ORT_THROW(msg);
}
} else {
try {
StartAsyncInference(context, infer_request);
} catch (std::string const& msg) {
ORT_THROW(msg);
}
if(infer_request == nullptr) {
ORT_THROW("OpenVINO Execution Provider :: There are no inference requests");
LOGS_DEFAULT(FATAL) << log_tag << "Create Infer Requests do not exist";
return;
}
#else

LOGS_DEFAULT(INFO) << log_tag << "Get Idle Request";
try {
StartAsyncInference(context, infer_request);
} catch (const std::runtime_error& e) {
ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what());
}
#endif
try {
CompleteAsyncInference(context, infer_request);
} catch (const std::runtime_error& e) {
@@ -696,13 +561,11 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
// Once the inference is completed, the infer_request becomes free and is placed back into pool of infer_requests_
inferRequestsQueue_->putIdleRequest(std::move(infer_request));
#ifndef NDEBUG
#ifndef IO_BUFFER_ENABLED // Printing performance counts is disabled when IO_BUFFER_ENABLED
if (openvino_ep::backend_utils::IsDebugEnabled()) {
inferRequestsQueue_->printstatus(); // Printing the elements of infer_requests_ vector pool only in debug mode
std::string& hw_target = session_context_.device_type;
printPerformanceCounts(std::move(infer_request_), std::cout, hw_target);
}
#endif
#endif
}
}
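Note for readers following the review question above about why the IO Buffer optimization was removed: with the `IO_BUFFER_ENABLED` branch gone, only the standard compile-and-infer path remains, where tensor data moves through host-visible memory instead of being wrapped around a shared `cl::Buffer` via a GPU remote context. A minimal, generic OpenVINO 2.x sketch of that remaining flow is below; it is a standalone illustration, not ONNX Runtime code, and the model path and device string are placeholders.

```cpp
// Generic OpenVINO 2.x sketch (not ORT code) of the plain compile/infer path
// that remains once the IO_BUFFER_ENABLED branch is removed: the model is
// compiled against a device string and tensors travel through host memory.
#include <openvino/openvino.hpp>

#include <cstring>

int main() {
  ov::Core core;
  // "model.onnx" and "GPU" are placeholders for the real model and device.
  std::shared_ptr<ov::Model> model = core.read_model("model.onnx");
  ov::CompiledModel compiled = core.compile_model(model, "GPU");

  ov::InferRequest request = compiled.create_infer_request();

  // Host-owned input tensor; the plugin copies it to device memory as needed.
  ov::Tensor input = request.get_input_tensor(0);
  std::memset(input.data(), 0, input.get_byte_size());

  request.start_async();
  request.wait();

  // The output also comes back through a host-visible tensor.
  ov::Tensor output = request.get_output_tensor(0);
  return output.get_size() > 0 ? 0 : 1;
}
```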
22 changes: 13 additions & 9 deletions onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -94,11 +94,6 @@ class BasicBackend : public IBackend {
void EnableStreams();
void SetNumThreads(ov::AnyMap& device_config);
void StartAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);

#ifdef IO_BUFFER_ENABLED
void StartRemoteAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);
#endif

void CompleteAsyncInference(Ort::KernelContext& context, std::shared_ptr<OVInferRequest> infer_request);

SessionContext& session_context_;
@@ -108,10 +103,6 @@
OVExeNetwork exe_network_;
std::map<std::string, std::shared_ptr<ov::Node>> const_outputs_map_;
std::unique_ptr<InferRequestsQueue> inferRequestsQueue_;
#if defined IO_BUFFER_ENABLED
OVRemoteContextPtr remote_context_;
#endif

using ort_tensor_key_t = const std::string;
std::map<ort_tensor_key_t, ov_tensor_data_t> ort_ov_tensor_map;
std::unique_ptr<OnnxToOvNetworkBindings> bindings_;
@@ -121,6 +112,7 @@ class InferRequestsQueue {
public:
InferRequestsQueue(OVExeNetwork& net, size_t nireq, std::function<void(OVInferRequestPtr)> initializer) {
OVInferRequestPtr infer_request;
live_threads=nireq;
for (size_t id = 0; id < nireq; id++) {
infer_request = std::make_shared<OVInferRequest>(net.CreateInferRequest());
initializer(infer_request);
@@ -152,16 +144,28 @@

OVInferRequestPtr getIdleRequest() {
std::unique_lock<std::mutex> lock(_mutex);
std::cout << "get Idle Request" << live_threads << "\n";
if(live_threads==0) {
return nullptr;
}

_cv.wait(lock, [this] { return infer_requests_.size() > 0; });
auto request = infer_requests_.at(0);
infer_requests_.erase(infer_requests_.begin());
return request;
}

void deleteRequest() {
std::unique_lock<std::mutex> lock(_mutex);
live_threads=live_threads-1;
std::cout << "delete Request" << live_threads << "\n";
}

private:
std::mutex _mutex;
std::condition_variable _cv;
std::vector<OVInferRequestPtr> infer_requests_;
int live_threads;
};

} // namespace openvino_ep
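The `InferRequestsQueue` changes above add a `live_threads` counter plus a `deleteRequest()` hook so the pool can shrink when `CompleteAsyncInference` cancels a failed request, and `getIdleRequest()` can report exhaustion by returning `nullptr`. A simplified, standalone sketch of that pool contract follows; the class and member names are invented rather than taken from `basic_backend.h`, and notifying waiters on deletion is an assumption about the intended behaviour, not a copy of the diff.

```cpp
// Illustrative request pool (invented names, not the ORT classes) whose
// capacity can shrink when an in-flight request is cancelled.
#include <condition_variable>
#include <cstddef>
#include <iostream>
#include <memory>
#include <mutex>
#include <vector>

struct Request { int id; };
using RequestPtr = std::shared_ptr<Request>;

class RequestPool {
 public:
  explicit RequestPool(std::size_t n) : live_(n) {
    for (std::size_t i = 0; i < n; ++i)
      idle_.push_back(std::make_shared<Request>(Request{static_cast<int>(i)}));
  }

  // Blocks until a request is idle; returns nullptr once the pool is empty.
  RequestPtr GetIdle() {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return !idle_.empty() || live_ == 0; });
    if (live_ == 0) return nullptr;
    RequestPtr r = idle_.front();
    idle_.erase(idle_.begin());
    return r;
  }

  // Return a request to the pool after a successful inference.
  void PutIdle(RequestPtr r) {
    { std::lock_guard<std::mutex> lock(mutex_); idle_.push_back(std::move(r)); }
    cv_.notify_one();
  }

  // Drop a request permanently (mirrors CancelRequest() + deleteRequest()).
  void DeleteRequest() {
    { std::lock_guard<std::mutex> lock(mutex_); --live_; }
    cv_.notify_all();  // assumption: wake waiters so they can observe live_ == 0
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  std::vector<RequestPtr> idle_;
  std::size_t live_;
};

int main() {
  RequestPool pool(1);
  RequestPtr r = pool.GetIdle();
  bool inference_failed = true;  // pretend WaitRequest() threw
  if (inference_failed) {
    pool.DeleteRequest();
  } else {
    pool.PutIdle(std::move(r));
  }
  std::cout << (pool.GetIdle() == nullptr ? "pool exhausted\n" : "got request\n");
  return 0;
}
```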