19 changes: 15 additions & 4 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -44,6 +44,10 @@ BackendManager::BackendManager(SessionContext& session_context,
shared_context_{shared_context} {
subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph);

bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;
bool npu = session_context_.device_type.find("NPU") != std::string::npos;

subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) {
// return empty if graph has no inputs or if types are not one of FP32/FP16
// else assume the type of the first input
@@ -105,8 +109,7 @@ BackendManager::BackendManager(SessionContext& session_context,
if (ModelHasSymbolicInputDims(subgraph)) {
subgraph_context_.has_dynamic_input_shape = true;
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
if ((session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos) &&
if (cpu_or_gpu || (npu && session_context_.enable_causallm) &&
!session_context_.disable_dynamic_shapes) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
<< "Creating backend Dynamic Shapes";
@@ -480,6 +483,9 @@ BackendManager::ReWriteBatchDimWithOne(const ONNX_NAMESPACE::ModelProto& model_p
void BackendManager::Compute(OrtKernelContext* context) {
Ort::KernelContext ctx(context);
std::chrono::high_resolution_clock::time_point start_compute, end_compute;
bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;
bool npu = session_context_.device_type.find("NPU") != std::string::npos;
#ifdef OPENVINO_FIL_ENABLED
static bool fil_enabled = true;
if (fil_enabled) {
@@ -493,8 +499,7 @@ void BackendManager::Compute(OrtKernelContext* context) {
// disable_dynamic_shapes is always set to true for OV NPU plugin.
if (subgraph_context_.has_dynamic_input_shape &&
!session_context_.disable_dynamic_shapes &&
(session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {
(cpu_or_gpu || (npu && session_context_.enable_causallm))) {
concrete_backend_->Infer(context);
} else if (subgraph_context_.has_dynamic_input_shape) {
std::vector<std::vector<int64_t>> tensor_shapes = GetInputTensorShapes(ctx);
@@ -567,5 +572,11 @@ void BackendManager::ShutdownBackendManager() {
concrete_backend_.reset();
}

void BackendManager::RewindKVCache(size_t index) {
if (concrete_backend_) {
concrete_backend_->RewindKVCache(index);
}
}

} // namespace openvino_ep
} // namespace onnxruntime
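The constructor and Compute() hunks above fold the repeated device_type string checks into cpu_or_gpu and npu flags, and route NPU through the dynamic-shape path only when enable_causallm is set. A minimal, self-contained sketch of that dispatch predicate follows; it illustrates the intent and is not code from this PR, and the function name is hypothetical.

#include <string>

// Hypothetical helper mirroring the checks added in BackendManager: the
// dynamic-shape path is used for CPU/GPU, and for NPU only when stateful
// causal-LM compilation is enabled and dynamic shapes are not disabled.
bool UseDynamicShapePath(const std::string& device_type,
                         bool enable_causallm,
                         bool disable_dynamic_shapes) {
  const bool cpu_or_gpu = device_type.find("CPU") != std::string::npos ||
                          device_type.find("GPU") != std::string::npos;
  const bool npu = device_type.find("NPU") != std::string::npos;
  return (cpu_or_gpu || (npu && enable_causallm)) && !disable_dynamic_shapes;
}

// UseDynamicShapePath("NPU", /*enable_causallm=*/true,  false) -> true
// UseDynamicShapePath("NPU", /*enable_causallm=*/false, false) -> false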
1 change: 1 addition & 0 deletions onnxruntime/core/providers/openvino/backend_manager.h
@@ -30,6 +30,7 @@ class BackendManager {
SessionContext& GetSessionContext();
Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph);
ov::CompiledModel GetOVCompiledModel();
void RewindKVCache(size_t index);

private:
std::unique_ptr<ONNX_NAMESPACE::ModelProto> GetModelProtoFromFusedNode(
67 changes: 48 additions & 19 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -15,6 +15,7 @@
#include "core/providers/openvino/backends/basic_backend.h"
#include "core/providers/openvino/onnx_ctx_model_helper.h"
#include "core/providers/openvino/backend_manager.h"
#include "core/providers/openvino/ov_stateful_patch_utils.h"

namespace onnxruntime {

@@ -29,6 +30,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
ptr_stream_t& model_stream)
: session_context_{session_context}, subgraph_context_{subgraph_context}, shared_context_{shared_context} {
std::string& hw_target = session_context_.device_type;
bool enable_causallm = session_context_.enable_causallm;

if (ValidateSubgraph(const_outputs_map_))
return;
@@ -43,7 +45,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
// Setting OpenCL queue throttling for GPU
EnableGPUThrottling(device_config);

// Enable streams; default=1 unless ovverriden by user config
// Enable streams; default=1 unless overridden by user configuration
EnableStreams();

// Set the inference_num_threads property of the CPU
@@ -76,7 +78,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
} else if (!session_context_.has_external_weights &&
!subgraph_context_.has_dynamic_input_shape &&
!session_context_.so_context_enable &&
auto_unified_compile) {
!enable_causallm && auto_unified_compile) {
// Unified OV compile_model is efficient when ov model caching is enabled
// Unified OV compile_model API is supported with AUTO from version 2024.3 and above
// Inputs with static dimensions
@@ -96,7 +98,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
}
auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
exe_network_ = OVCore::Get()->CompileModel(
ov_model, hw_target, device_config, subgraph_context_.subgraph_name);
ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
}
LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
} catch (const char* msg) {
@@ -120,7 +122,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
};
}
inferRequestsQueue_ = std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(exe_network_, num_infer_req, std::move(initializer)));
bindings_ = std::make_unique<OnnxToOvNetworkBindings>(exe_network_, subgraph_context_);
bindings_ = std::make_unique<OnnxToOvNetworkBindings>(exe_network_, subgraph_context_, session_context_);
}

bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map) {
@@ -181,6 +183,15 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
if (!session_context_.load_config.empty()) {
const std::map<std::string, ov::AnyMap>& target_config = session_context_.load_config;

if ((session_context_.device_type.find("NPU") != std::string::npos) && session_context_.enable_causallm) {
if (target_config.find("NPU") != target_config.end()) {
auto npu_genai_config = target_config.at("NPU");
CausalLMConfig().ApplyConfig(npu_genai_config, device_config);
} else {
LOGS_DEFAULT(WARNING) << "ORT GenAI CausalLMConfig Configuration not found.";
}
}

if (session_context_.device_type.find("NPU") != std::string::npos) {
auto npuw_config = target_config.at("NPU");

@@ -246,7 +257,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options,
const std::vector<ov::PropertyName>& supported_properties) {
for (const auto& [key, value] : config_options) {
if (key.find("NPUW") != std::string::npos) {
if ((key.find("NPUW") != std::string::npos) ||
((device_config.find(key) != device_config.end()) && session_context_.enable_causallm)) {
continue;
}
if (is_supported_and_mutable(key, supported_properties)) {
@@ -339,6 +351,13 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) {
device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads));
}

void BasicBackend::RewindKVCache(size_t index) {
OVInferRequestPtr infer_request;
infer_request = inferRequestsQueue_->getIdleRequest();
infer_request->RewindKVCache(index);
inferRequestsQueue_->putIdleRequest(std::move(infer_request));
}

// Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on
// an Infer Request indexed by infer_req_idx
void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
@@ -351,7 +370,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
size_t batch_slice_idx = 0;
if (subgraph_context_.has_dynamic_input_shape &&
!session_context_.disable_dynamic_shapes &&
cpu_or_gpu) {
cpu_or_gpu || (npu && session_context_.enable_causallm)) {
auto tensor = context.GetInput(input_info.onnx_index);
auto tensor_info = tensor.GetTensorTypeAndShapeInfo();
auto tensor_shape = tensor_info.GetShape();
@@ -409,7 +428,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
} // Loop subgraph original input

if (npu) {
// For Stateful Compilation i.e. enable_causallm as True, we use the dynamic shapes path for NPU plugin as well.
if (npu && !session_context_.enable_causallm) {
// Set the output blob as remote blob
for (const auto& output_info : bindings_->network_outputs_) {
Ort::UnownedValue tensor = context.GetOutput(output_info.onnx_index, output_info.onnx_shape);
@@ -453,19 +473,20 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe

bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;
if (cpu_or_gpu) {
bool npu = session_context_.device_type.find("NPU") != std::string::npos;
if (cpu_or_gpu || (npu && session_context_.enable_causallm)) {
for (const auto& output_info : bindings_->network_outputs_) {
OVTensorPtr graph_output_blob;
try {
graph_output_blob = infer_request->GetTensor(output_info.name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
OVTensorPtr graph_output_blob;
try {
graph_output_blob = infer_request->GetTensor(output_info.name);
} catch (const char* msg) {
ORT_THROW(msg);
}
size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
} else {
size_t batch_slice = 0;
@@ -538,11 +559,19 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
try {
StartAsyncInference(context, infer_request);
} catch (const std::runtime_error& e) {
// If the inference fails (exception from ov::InferRequest::infer()),
// we need to put the infer_request back into the pool to avoid deadlocks
// and to allow the next inference request to proceed.
inferRequestsQueue_->putIdleRequest(std::move(infer_request));
ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what());
}
try {
CompleteAsyncInference(context, infer_request);
} catch (const std::runtime_error& e) {
// If the inference fails (exception from ov::InferRequest::infer()),
// we need to put the infer_request back into the pool to avoid deadlocks
// and to allow the next inference request to proceed.
inferRequestsQueue_->putIdleRequest(std::move(infer_request));
ORT_THROW(log_tag + " Exception at CompleteAsyncInference: " + e.what());
}

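BasicBackend::RewindKVCache above borrows an idle request and delegates to OVInferRequest::RewindKVCache, which this diff does not show. The sketch below is one plausible way to implement a rewind with OpenVINO's stateful API; it is not the PR's implementation, and it assumes each KV-cache variable state is laid out as [batch, heads, seq_len, head_dim] with the sequence on axis 2.

#include <openvino/openvino.hpp>

// Illustrative sketch only: truncate every variable state of a stateful
// compiled model so that only the first `index` tokens along the assumed
// sequence axis are kept.
void RewindKVCacheSketch(ov::InferRequest& request, size_t index) {
  for (auto& state : request.query_state()) {
    ov::Tensor current = state.get_state();
    ov::Shape shape = current.get_shape();
    constexpr size_t kSeqAxis = 2;  // assumption: [batch, heads, seq_len, head_dim]
    if (shape.size() <= kSeqAxis || shape[kSeqAxis] <= index) {
      continue;  // nothing to trim for this state
    }
    // View the region covering tokens [0, index), copy it into a fresh tensor,
    // and install that tensor as the new state.
    ov::Coordinate begin(shape.size(), 0);
    ov::Coordinate end(shape);
    end[kSeqAxis] = index;
    ov::Tensor roi(current, begin, end);
    ov::Shape trimmed_shape = shape;
    trimmed_shape[kSeqAxis] = index;
    ov::Tensor trimmed(current.get_element_type(), trimmed_shape);
    roi.copy_to(trimmed);
    state.set_state(trimmed);
  }
}

The actual OVInferRequest implementation in this PR may instead rely on GenAI/NPUW-specific state handling; the point of the sketch is only the general mechanism of trimming stateful KV-cache tensors.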
16 changes: 13 additions & 3 deletions onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -42,12 +42,22 @@ struct OnnxToOvNetworkBindings {
std::vector<ParameterInfo> network_outputs_;
std::vector<ParameterInfo> network_inputs_;

OnnxToOvNetworkBindings(OVExeNetwork& exec_network, SubGraphContext& subgraph_context) {
OnnxToOvNetworkBindings(OVExeNetwork& exec_network, SubGraphContext& subgraph_context, SessionContext& session_context) {
auto populate = [&](auto& input_output_map, const SubGraphContext::string_index_map_t& onnx_input_map, const auto& ov_parameters) {
for (const auto& [onnx_name, onnx_param_index] : onnx_input_map) {
auto it = std::find_if(ov_parameters.begin(), ov_parameters.end(),
[&onnx_name](const auto& ov_parameter_info) { return ov_parameter_info.get_names().contains(onnx_name); });

// For Stateful Model Compilation, the ONNX model includes KV cache (past/present) tensors.
// However, these tensors are internally converted to a stateful representation, which removes them.
// To prevent runtime exceptions, we simply continue processing here.
if ((onnx_name.empty() || onnx_name == "beam_idx" ||
onnx_name.find("past_key_values") != std::string::npos ||
onnx_name.find("present") != std::string::npos) &&
session_context.enable_causallm) {
continue;
}

ORT_ENFORCE(it != ov_parameters.end(), backend_utils::log_tag,
"Input names mismatch between OpenVINO and ONNX. ", onnx_name,
" doesn't exist in the list of OpenVINO input tensor names");
@@ -85,6 +95,7 @@ class BasicBackend : public IBackend {
ov::CompiledModel GetOVCompiledModel() override {
return exe_network_.Get();
}
void RewindKVCache(size_t index) override;

private:
bool ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);
@@ -114,7 +125,7 @@ class InferRequestsQueue {
OVInferRequestPtr infer_request;
live_threads=nireq;
for (size_t id = 0; id < nireq; id++) {
infer_request = std::make_shared<OVInferRequest>(net.CreateInferRequest());
infer_request = net.CreateInferRequest();
initializer(infer_request);
infer_requests_.push_back(infer_request);
}
@@ -144,7 +155,6 @@

OVInferRequestPtr getIdleRequest() {
std::unique_lock<std::mutex> lock(_mutex);
std::cout << "get Idle Request" << live_threads << "\n";
if(live_threads==0) {
return nullptr;
}
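The new skip branch in OnnxToOvNetworkBindings drops ONNX-side names that disappear once the model is converted to a stateful representation. A small standalone restatement of that filter, with a hypothetical function name, makes the condition easier to scan:

#include <string>

// Mirrors the skip condition added above: with stateful causal-LM compilation,
// beam_idx and the past/present KV-cache tensors exist in the ONNX graph but
// not in the compiled OpenVINO model, so no binding is created for them.
bool IsStatefulOnlyTensor(const std::string& onnx_name, bool enable_causallm) {
  if (!enable_causallm) return false;
  return onnx_name.empty() || onnx_name == "beam_idx" ||
         onnx_name.find("past_key_values") != std::string::npos ||
         onnx_name.find("present") != std::string::npos;
}

// IsStatefulOnlyTensor("past_key_values.0.key", true) -> true
// IsStatefulOnlyTensor("input_ids", true)             -> false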
1 change: 1 addition & 0 deletions onnxruntime/core/providers/openvino/contexts.h
@@ -97,6 +97,7 @@ struct ProviderInfo {
bool disable_dynamic_shapes{false}; // [disable_dynamic_shapes]: Rewrite dynamic shaped models to
// static shape at runtime and execute.
bool enable_qdq_optimizer{false}; // Enables QDQ pruning for efficient inference latency with NPU
bool enable_causallm{false}; // Enables Causal LM Compilation for ORT GenAI OVEP Pass
bool so_context_enable{false}; // ORT session option
bool so_disable_cpu_ep_fallback{false}; // ORT session option
bool so_context_embed_mode{false}; // ORT session option
1 change: 1 addition & 0 deletions onnxruntime/core/providers/openvino/ibackend.h
@@ -17,6 +17,7 @@ class IBackend {
virtual void Infer(OrtKernelContext* context) = 0;
virtual ov::CompiledModel GetOVCompiledModel() = 0;
virtual ~IBackend() = default;
virtual void RewindKVCache(size_t index) {}
};
using ptr_stream_t = std::unique_ptr<std::istream>;
class BackendFactory {
19 changes: 19 additions & 0 deletions onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -254,6 +254,25 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span<const ch
}
}
}
} else if (key == "kvcache_rewind") {
// Convert kvcache_rewind value to int64_t
int64_t index;
try {
index = std::stoll(value);
} catch (const std::exception& e) {
LOGS_DEFAULT(WARNING) << "Conversion for kvcache_rewind string value to int64_t index failed."
<< "Exception:" + std::string(e.what());
return Status::OK();
}

// Trigger KVCache Rewind for target Backend
for (auto& backend : backend_managers_) {
if (index >= 0) {
backend.RewindKVCache(static_cast<size_t>(index));
} else {
LOGS_DEFAULT(WARNING) << "kvcache_rewind index is < 0:\t" << index;
}
}
} else {
// Handle unknown options
LOGS_DEFAULT(WARNING) << "Unknown key/value pair - ignoring " << key << "/" << value;
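The kvcache_rewind branch above makes the rewind reachable through ORT's dynamic EP options. The following is a hedged usage sketch from the application side, assuming the Ort::Session::SetEpDynamicOptions wrapper present in recent ONNX Runtime releases; the key name and value format come from this diff, everything else is illustrative.

#include <onnxruntime_cxx_api.h>

// Illustrative only: ask the OpenVINO EP to rewind its KV cache so that the
// first 128 tokens are kept, e.g. after discarding rejected draft tokens.
void RewindTo128Tokens(Ort::Session& session) {
  const char* keys[] = {"kvcache_rewind"};
  const char* values[] = {"128"};  // parsed with std::stoll on the EP side
  session.SetEpDynamicOptions(keys, values, 1);
}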
@@ -343,13 +343,20 @@ static void ParseProviderInfo(const ProviderOptions& provider_options,

pi.enable_qdq_optimizer = ParseBooleanOption(provider_options, "enable_qdq_optimizer");

pi.enable_causallm = ParseBooleanOption(provider_options, "enable_causallm");

pi.disable_dynamic_shapes = ParseBooleanOption(provider_options, "disable_dynamic_shapes");
} catch (std::string msg) {
ORT_THROW(msg);
}
// Always true for NPU plugin or when passed .
if (pi.device_type.find("NPU") != std::string::npos) {
pi.disable_dynamic_shapes = true;
// For Stateful Compilation i.e. enable_causallm as True, we use the dynamic shapes path.
if (pi.enable_causallm) {
pi.disable_dynamic_shapes = false;
} else {
pi.disable_dynamic_shapes = true;
}
}
}

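enable_causallm is parsed as a boolean provider option, and on NPU it re-enables the dynamic-shapes path. Below is a sketch of how a session might opt in, assuming the AppendExecutionProvider_OpenVINO_V2 entry point available in recent ONNX Runtime builds; the option names come from this diff, the rest is illustrative.

#include <onnxruntime_cxx_api.h>
#include <string>
#include <unordered_map>

// Illustrative session setup: target the NPU device and enable stateful
// causal-LM compilation so the EP keeps the dynamic-shape path.
Ort::Session CreateCausalLMSession(Ort::Env& env, const ORTCHAR_T* model_path) {
  Ort::SessionOptions options;
  std::unordered_map<std::string, std::string> ov_options{
      {"device_type", "NPU"},
      {"enable_causallm", "true"},
  };
  options.AppendExecutionProvider_OpenVINO_V2(ov_options);
  return Ort::Session(env, model_path, options);
}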