diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index b7e6245b1834f..c22f2e9cc0fa1 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -44,6 +44,10 @@ BackendManager::BackendManager(SessionContext& session_context, shared_context_{shared_context} { subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph); + bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos; + bool npu = session_context_.device_type.find("NPU") != std::string::npos; + subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) { // return empty if graph has no inputs or if types are not one of FP32/FP16 // else assume the type of the first input @@ -105,8 +109,7 @@ BackendManager::BackendManager(SessionContext& session_context, if (ModelHasSymbolicInputDims(subgraph)) { subgraph_context_.has_dynamic_input_shape = true; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims"; - if ((session_context_.device_type.find("CPU") != std::string::npos || - session_context_.device_type.find("GPU") != std::string::npos) && + if (cpu_or_gpu || (npu && session_context_.enable_causallm) && !session_context_.disable_dynamic_shapes) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. " << "Creating backend Dynamic Shapes"; @@ -480,6 +483,9 @@ BackendManager::ReWriteBatchDimWithOne(const ONNX_NAMESPACE::ModelProto& model_p void BackendManager::Compute(OrtKernelContext* context) { Ort::KernelContext ctx(context); std::chrono::high_resolution_clock::time_point start_compute, end_compute; + bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos; + bool npu = session_context_.device_type.find("NPU") != std::string::npos; #ifdef OPENVINO_FIL_ENABLED static bool fil_enabled = true; if (fil_enabled) { @@ -493,8 +499,7 @@ void BackendManager::Compute(OrtKernelContext* context) { // disable_dynamic_shapes is always set to true for OV NPU plugin. 
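The two hunks above introduce the cpu_or_gpu / npu flags and extend the dynamic-shape path to NPU when enable_causallm is set. A minimal sketch of the intended predicate, matching the check used in Compute(); UseDynamicShapeBackend is a hypothetical helper name, not part of the patch:

static bool UseDynamicShapeBackend(bool cpu_or_gpu, bool npu,
                                   bool enable_causallm, bool disable_dynamic_shapes) {
  // CPU and GPU keep the existing dynamic-shape handling; NPU joins it only for
  // stateful causal-LM compilation, and never when dynamic shapes are disabled.
  return (cpu_or_gpu || (npu && enable_causallm)) && !disable_dynamic_shapes;
}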
if (subgraph_context_.has_dynamic_input_shape && !session_context_.disable_dynamic_shapes && - (session_context_.device_type.find("CPU") != std::string::npos || - session_context_.device_type.find("GPU") != std::string::npos)) { + (cpu_or_gpu || (npu && session_context_.enable_causallm))) { concrete_backend_->Infer(context); } else if (subgraph_context_.has_dynamic_input_shape) { std::vector> tensor_shapes = GetInputTensorShapes(ctx); @@ -567,5 +572,11 @@ void BackendManager::ShutdownBackendManager() { concrete_backend_.reset(); } +void BackendManager::RewindKVCache(size_t index) { + if (concrete_backend_) { + concrete_backend_->RewindKVCache(index); + } +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index cb1ca7001a00c..799dc50dd7a63 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -30,6 +30,7 @@ class BackendManager { SessionContext& GetSessionContext(); Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph); ov::CompiledModel GetOVCompiledModel(); + void RewindKVCache(size_t index); private: std::unique_ptr GetModelProtoFromFusedNode( diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index dedb6da1bae58..7902b3edb2276 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -15,6 +15,7 @@ #include "core/providers/openvino/backends/basic_backend.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" #include "core/providers/openvino/backend_manager.h" +#include "core/providers/openvino/ov_stateful_patch_utils.h" namespace onnxruntime { @@ -29,6 +30,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr ptr_stream_t& model_stream) : session_context_{session_context}, subgraph_context_{subgraph_context}, shared_context_{shared_context} { std::string& hw_target = session_context_.device_type; + bool enable_causallm = session_context_.enable_causallm; if (ValidateSubgraph(const_outputs_map_)) return; @@ -43,7 +45,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Setting OpenCL queue throttling for GPU EnableGPUThrottling(device_config); - // Enable streams; default=1 unless ovverriden by user config + // Enable streams; default=1 unless overridden by user configuration EnableStreams(); // Set the inference_num_threads property of the CPU @@ -76,7 +78,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } else if (!session_context_.has_external_weights && !subgraph_context_.has_dynamic_input_shape && !session_context_.so_context_enable && - auto_unified_compile) { + !enable_causallm && auto_unified_compile) { // Unified OV compile_model is efficient when ov model caching is enabled // Unified OV compile_model API is supported with AUTO from version 2024.3 and above // Inputs with static dimensions @@ -96,7 +98,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_); exe_network_ = OVCore::Get()->CompileModel( - ov_model, hw_target, device_config, subgraph_context_.subgraph_name); + ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name); } LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { @@ -120,7 
+122,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr }; } inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req, std::move(initializer))); - bindings_ = std::make_unique(exe_network_, subgraph_context_); + bindings_ = std::make_unique(exe_network_, subgraph_context_, session_context_); } bool BasicBackend::ValidateSubgraph(std::map>& const_outputs_map) { @@ -181,6 +183,15 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { if (!session_context_.load_config.empty()) { const std::map& target_config = session_context_.load_config; + if ((session_context_.device_type.find("NPU") != std::string::npos) && session_context_.enable_causallm) { + if (target_config.find("NPU") != target_config.end()) { + auto npu_genai_config = target_config.at("NPU"); + CausalLMConfig().ApplyConfig(npu_genai_config, device_config); + } else { + LOGS_DEFAULT(WARNING) << "ORT GenAI CausalLMConfig Configuration not found."; + } + } + if (session_context_.device_type.find("NPU") != std::string::npos) { auto npuw_config = target_config.at("NPU"); @@ -246,7 +257,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options, const std::vector& supported_properties) { for (const auto& [key, value] : config_options) { - if (key.find("NPUW") != std::string::npos) { + if ((key.find("NPUW") != std::string::npos) || + ((device_config.find(key) != device_config.end()) && session_context_.enable_causallm)) { continue; } if (is_supported_and_mutable(key, supported_properties)) { @@ -339,6 +351,13 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) { device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads)); } +void BasicBackend::RewindKVCache(size_t index) { + OVInferRequestPtr infer_request; + infer_request = inferRequestsQueue_->getIdleRequest(); + infer_request->RewindKVCache(index); + inferRequestsQueue_->putIdleRequest(std::move(infer_request)); +} + // Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on // an Infer Request indexed by infer_req_idx void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) { @@ -351,7 +370,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && !session_context_.disable_dynamic_shapes && - cpu_or_gpu) { + cpu_or_gpu || (npu && session_context_.enable_causallm)) { auto tensor = context.GetInput(input_info.onnx_index); auto tensor_info = tensor.GetTensorTypeAndShapeInfo(); auto tensor_shape = tensor_info.GetShape(); @@ -409,7 +428,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } } // Loop subgraph original input - if (npu) { + // For Stateful Compilation i.e. enable_causallm as True, we use the dynamic shapes path for NPU plugin as well. 
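// Note (not part of the patch): with enable_causallm, outputs are gathered in
// CompleteAsyncInference the same way as on CPU/GPU, so the NPU remote-blob
// binding below is intentionally skipped for stateful compilation.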
+ if (npu && !session_context_.enable_causallm) { // Set the output blob as remote blob for (const auto& output_info : bindings_->network_outputs_) { Ort::UnownedValue tensor = context.GetOutput(output_info.onnx_index, output_info.onnx_shape); @@ -453,19 +473,20 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos || session_context_.device_type.find("GPU") != std::string::npos; - if (cpu_or_gpu) { + bool npu = session_context_.device_type.find("NPU") != std::string::npos; + if (cpu_or_gpu || (npu && session_context_.enable_causallm)) { for (const auto& output_info : bindings_->network_outputs_) { - OVTensorPtr graph_output_blob; - try { - graph_output_blob = infer_request->GetTensor(output_info.name); - } catch (const char* msg) { - ORT_THROW(msg); - } - size_t batch_size = 1; - Ort::UnownedValue output_tensor = - GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names); - auto mem_info = output_tensor.GetTensorMemoryInfo(); - if (mem_info.GetAllocatorName() == OpenVINO_GPU) { + OVTensorPtr graph_output_blob; + try { + graph_output_blob = infer_request->GetTensor(output_info.name); + } catch (const char* msg) { + ORT_THROW(msg); + } + size_t batch_size = 1; + Ort::UnownedValue output_tensor = + GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names); + auto mem_info = output_tensor.GetTensorMemoryInfo(); + if (mem_info.GetAllocatorName() == OpenVINO_GPU) { return; } else { size_t batch_slice = 0; @@ -538,11 +559,19 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { try { StartAsyncInference(context, infer_request); } catch (const std::runtime_error& e) { + // If the inference fails (exception from ov::InferRequest::infer()), + // we need to put the infer_request back into the pool to avoid deadlocks + // and to allow the next inference request to proceed. + inferRequestsQueue_->putIdleRequest(std::move(infer_request)); ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what()); } try { CompleteAsyncInference(context, infer_request); } catch (const std::runtime_error& e) { + // If the inference fails (exception from ov::InferRequest::infer()), + // we need to put the infer_request back into the pool to avoid deadlocks + // and to allow the next inference request to proceed. 
+ inferRequestsQueue_->putIdleRequest(std::move(infer_request)); ORT_THROW(log_tag + " Exception at CompleteAsyncInference: " + e.what()); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 697c088a80620..fe178ccb5661b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -42,12 +42,22 @@ struct OnnxToOvNetworkBindings { std::vector network_outputs_; std::vector network_inputs_; - OnnxToOvNetworkBindings(OVExeNetwork& exec_network, SubGraphContext& subgraph_context) { + OnnxToOvNetworkBindings(OVExeNetwork& exec_network, SubGraphContext& subgraph_context, SessionContext& session_context) { auto populate = [&](auto& input_output_map, const SubGraphContext::string_index_map_t& onnx_input_map, const auto& ov_parameters) { for (const auto& [onnx_name, onnx_param_index] : onnx_input_map) { auto it = std::find_if(ov_parameters.begin(), ov_parameters.end(), [&onnx_name](const auto& ov_parameter_info) { return ov_parameter_info.get_names().contains(onnx_name); }); + // For Stateful Model Compilation, the ONNX model includes KV cache (past/present) tensors. + // However, these tensors are internally converted to a stateful representation, which removes them. + // To prevent runtime exceptions, we simply continue processing here. + if ((onnx_name.empty() || onnx_name == "beam_idx" || + onnx_name.find("past_key_values") != std::string::npos || + onnx_name.find("present") != std::string::npos) && + session_context.enable_causallm) { + continue; + } + ORT_ENFORCE(it != ov_parameters.end(), backend_utils::log_tag, "Input names mismatch between OpenVINO and ONNX. ", onnx_name, " doesn't exist in the list of OpenVINO input tensor names"); @@ -85,6 +95,7 @@ class BasicBackend : public IBackend { ov::CompiledModel GetOVCompiledModel() override { return exe_network_.Get(); } + void RewindKVCache(size_t index) override; private: bool ValidateSubgraph(std::map>& const_outputs_map); @@ -114,7 +125,7 @@ class InferRequestsQueue { OVInferRequestPtr infer_request; live_threads=nireq; for (size_t id = 0; id < nireq; id++) { - infer_request = std::make_shared(net.CreateInferRequest()); + infer_request = net.CreateInferRequest(); initializer(infer_request); infer_requests_.push_back(infer_request); } @@ -144,7 +155,6 @@ class InferRequestsQueue { OVInferRequestPtr getIdleRequest() { std::unique_lock lock(_mutex); - std::cout << "get Idle Request" << live_threads << "\n"; if(live_threads==0) { return nullptr; } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 7560f4570bd32..2506d587dd3ad 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -97,6 +97,7 @@ struct ProviderInfo { bool disable_dynamic_shapes{false}; // [disable_dynamic_shapes]: Rewrite dynamic shaped models to // static shape at runtime and execute. 
bool enable_qdq_optimizer{false}; // Enables QDQ pruning for efficient inference latency with NPU + bool enable_causallm{false}; // Enables Causal LM Compilation for ORT GenAI OVEP Pass bool so_context_enable{false}; // ORT session option bool so_disable_cpu_ep_fallback{false}; // ORT session option bool so_context_embed_mode{false}; // ORT session option diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 4532349897d17..752668b3c6fbe 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -17,6 +17,7 @@ class IBackend { virtual void Infer(OrtKernelContext* context) = 0; virtual ov::CompiledModel GetOVCompiledModel() = 0; virtual ~IBackend() = default; + virtual void RewindKVCache(size_t index) {} }; using ptr_stream_t = std::unique_ptr; class BackendFactory { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 3793317749a04..d12f1edc57da5 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -254,6 +254,25 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span= 0) { + backend.RewindKVCache(static_cast(index)); + } else { + LOGS_DEFAULT(WARNING) << "kvcache_rewind index is < 0:\t" << index; + } + } } else { // Handle unknown options LOGS_DEFAULT(WARNING) << "Unknown key/value pair - ignoring " << key << "/" << value; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index e5526ecd52bb9..f7e64a9be2c60 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -343,13 +343,20 @@ static void ParseProviderInfo(const ProviderOptions& provider_options, pi.enable_qdq_optimizer = ParseBooleanOption(provider_options, "enable_qdq_optimizer"); + pi.enable_causallm = ParseBooleanOption(provider_options, "enable_causallm"); + pi.disable_dynamic_shapes = ParseBooleanOption(provider_options, "disable_dynamic_shapes"); } catch (std::string msg) { ORT_THROW(msg); } // Always true for NPU plugin or when passed . if (pi.device_type.find("NPU") != std::string::npos) { - pi.disable_dynamic_shapes = true; + // For Stateful Compilation i.e. enable_causallm as True, we use the dynamic shapes path. 
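// Net effect for NPU (illustrative summary of the branch below): enable_causallm == true
// leaves dynamic shapes enabled, overriding any user-supplied disable_dynamic_shapes,
// while enable_causallm == false keeps the historical behaviour of forcing static shapes.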
+ if (pi.enable_causallm) { + pi.disable_dynamic_shapes = false; + } else { + pi.disable_dynamic_shapes = true; + } } } diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 0024a5e121bbf..0818f350562e9 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -7,6 +7,8 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/backend_utils.h" +#include "core/providers/openvino/backends/basic_backend.h" +#include "core/providers/openvino/ov_stateful_patch_utils.h" using Exception = ov::Exception; @@ -82,17 +84,85 @@ std::shared_ptr OVCore::ReadModel(std::string&& model, const std::str } } +OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model, + std::string& hw_target, + const ov::AnyMap& device_config) { + ov::CompiledModel compiled_model; + ov::AnyMap config = device_config; + + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateless OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } + + LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; + bool model_status = IsStateful(model); + LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); + if (!model_status) { + PatchStatefulDecoder(model); + } + + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateful OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } + + auto kv_pos = GetKVAxesPos(model); + + if (hw_target.find("NPU") != std::string::npos) { + KVDesc kv_desc; + auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { + return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; + }; + + kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); + kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); + + // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 + if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { + ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + } + + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; + std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; + std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; + std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + } + + UpdateNPUConfig(config, kv_pos, kv_desc); + } else { + // This patches the OV IR model so that it only produces the logits required for sampling. + // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, + // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
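// Illustrative effect of the transformation invoked below: the hidden-state input of the
// final logits MatMul, e.g. [batch, seq_len, hidden], is sliced to its last position, so the
// compiled model produces logits of shape [batch, 1, vocab] rather than [batch, seq_len, vocab].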
+ ApplySliceBeforeMatmulTransformation(model); + } + + LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; + compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); + OVExeNetwork exe(compiled_model, hw_target, true); + return exe; +} + OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_network, std::string& hw_target, ov::AnyMap& device_config, + bool enable_causallm, const std::string& name) { - ov::CompiledModel obj; + OVExeNetwork exe; try { - obj = core.compile_model(ie_cnn_network, hw_target, device_config); + if (enable_causallm) { + auto mutable_model = ie_cnn_network->clone(); + exe = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config); + } else { + auto obj = core.compile_model(ie_cnn_network, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } + #ifndef NDEBUG - printDebugInfo(obj); + printDebugInfo(exe.Get()); #endif - OVExeNetwork exe(obj); + return exe; } catch (const Exception& e) { ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); @@ -111,7 +181,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, #ifndef NDEBUG printDebugInfo(obj); #endif - OVExeNetwork exe(obj); + OVExeNetwork exe(obj, hw_target); return exe; } catch (const Exception& e) { ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); @@ -128,9 +198,9 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, ov::CompiledModel obj; obj = core.import_model(model_stream, hw_target, device_config); #ifndef NDEBUG - printDebugInfo(obj); + printDebugInfo(exe.Get()); #endif - OVExeNetwork exe(obj); + OVExeNetwork exe(obj, hw_target); return exe; } catch (const Exception& e) { ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); @@ -192,11 +262,16 @@ void OVCore::SetStreams(const std::string& device_type, int num_streams) { core.set_property(device_type, {ov::num_streams(num_streams)}); } -OVInferRequest OVExeNetwork::CreateInferRequest() { +std::shared_ptr OVExeNetwork::CreateInferRequest() { try { - auto infReq = obj.create_infer_request(); - OVInferRequest inf_obj(std::move(infReq)); - return inf_obj; + auto infReq = compiled_model_obj.create_infer_request(); + std::shared_ptr ovInfReq; + if (is_stateful_causallm) { + ovInfReq = std::make_shared(std::move(infReq), target_device); + } else { + ovInfReq = std::make_shared(std::move(infReq)); + } + return ovInfReq; } catch (const Exception& e) { ORT_THROW(log_tag + "Exception while creating InferRequest object: " + e.what()); } catch (...) { @@ -245,9 +320,9 @@ void OVInferRequest::StartAsync() { try { ovInfReq.start_async(); } catch (const Exception& e) { - ORT_THROW(log_tag + " Couldn't start Inference: " + e.what()); + throw std::runtime_error(log_tag + " Couldn't start Inference: " + e.what()); } catch (...) { - ORT_THROW(log_tag + " In Error Couldn't start Inference"); + throw std::runtime_error(log_tag + " In Error Couldn't start Inference"); } } @@ -255,9 +330,9 @@ void OVInferRequest::Infer() { try { ovInfReq.infer(); } catch (const Exception& e) { - ORT_THROW(log_tag + " Couldn't start Inference: " + e.what()); + throw std::runtime_error(log_tag + " Couldn't start Inference: " + e.what()); } catch (...) 
{ - ORT_THROW(log_tag + " In Error Couldn't start Inference"); + throw std::runtime_error(log_tag + " In Error Couldn't start Inference"); } } @@ -279,5 +354,160 @@ void OVInferRequest::QueryStatus() { std::cout << "ovInfReq.query_state()" << " "; } + +StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) + : OVInferRequest(std::move(infer_request)), target_device(device) { + bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); + if (gpu_or_npu) { + prefill_use_full_chat_history = true; + } +} + +void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, + const std::vector& shape, int32_t fill_value) { + ov::Tensor tensor = ov::Tensor(type, shape); + std::fill_n(tensor.data(), tensor.get_size(), fill_value); + ovInfReq.set_tensor(tensor_name, tensor); +} + +void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto* pData = tensor.data(); + for (size_t i = 0; i < tensor.get_size(); i++) { + cache.emplace_back(pData[i]); + } +} + +void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, + const std::vector& cache_data) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto new_shape = tensor.get_shape(); + new_shape[1] = cache_data.size(); + + auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); + auto* pNewData = new_tensor.data(); + std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); + + ovInfReq.set_tensor(tensor_name, new_tensor); +} + +std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { + // Check if tensor exists by examining input names in the compiled model + const auto& model = ovInfReq.get_compiled_model(); + bool tensor_exists = false; + + for (const auto& input : model.inputs()) { + const auto& names = input.get_names(); + if (names.find(tensor_name) != names.end()) { + tensor_exists = true; + break; + } + } + + if (tensor_exists) { + return ovInfReq.get_tensor(tensor_name); + } + + return std::nullopt; +} + +void StatefulOVInferRequest::PreProcessInferRequest() { + // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. + // TODO(ankit): Address this issue and implement the fix at the appropriate layer. + FillTensor("beam_idx", ov::element::i32, {1}, 0); + + // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. + if (prefill_use_full_chat_history) { + auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); + CacheTensor("input_ids", cached_input_ids); + + // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists + auto position_ids_opt = FindTensor("position_ids"); + bool has_position_ids = position_ids_opt.has_value(); + + if (has_position_ids) { + CacheTensor("position_ids", cached_position_ids); + } + + // If we're about to run the prefill model + if (input_ids_tensor.get_size() > 1) { + // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". + // This indicates that we are running a subsequent prompt (not the initial prefill). + if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. 
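// Illustrative sequence (token counts are made up): a first prefill of 5 tokens plus
// 3 decode steps leaves 8 entries in cached_input_ids; a second prefill of 4 tokens
// appends them (12 entries), and since 4 != 12 the state is reset here and the full
// 12-token history is replayed.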
+ ovInfReq.reset_state(); + + // Set tensors using cached values + SetTensorFromCache("input_ids", cached_input_ids); + + // Only set position_ids if it exists and we have cached values + if (has_position_ids && !cached_position_ids.empty()) { + SetTensorFromCache("position_ids", cached_position_ids); + } + } + } + } +} + +void StatefulOVInferRequest::StartAsync() { + PreProcessInferRequest(); + OVInferRequest::StartAsync(); +} + +void StatefulOVInferRequest::Infer() { + PreProcessInferRequest(); + OVInferRequest::Infer(); +} + +void StatefulOVInferRequest::RewindKVCache(size_t index) { + LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; + + if (prefill_use_full_chat_history) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. + ovInfReq.reset_state(); + + // Resize the cached "input_ids" and "position_ids" to the specified index. + if (cached_input_ids.size() > index) { + cached_input_ids.resize(index); + } + + if (cached_position_ids.size() > index) { + cached_position_ids.resize(index); + } + } else { + if (index == 0) { + // In this case, since we're resetting the entire KVCache, simply reset the state. + ovInfReq.reset_state(); + } else { + // Retrieve KVCache states and trim them to the specified index. + // The following logic is adapted from: + // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 + auto states = ovInfReq.query_state(); + for (auto& state : states) { + ov::Tensor old_tensor = state.get_state(); + // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] + auto shape = old_tensor.get_shape(); + + if (shape[2] > index) { + // Update the sequence length dimension to the specified index. + shape[2] = index; + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + // Create a trimmed tensor with the updated shape. + auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); + + // Copy the trimmed tensor into a new tensor and update the state. 
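// For example (shapes illustrative): a state tensor of shape [1, 8, 37, 64] rewound to
// index 32 is copied out below as a [1, 8, 32, 64] tensor before being set back on the state.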
+ ov::Tensor new_tensor(old_tensor.get_element_type(), shape); + trimmed_tensor.copy_to(new_tensor); + + state.set_state(new_tensor); + } + } + } + } +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 866f4a02f7780..c3d165b40840c 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -61,10 +61,14 @@ struct OVCore : WeakSingleton { // OV Interface For Reading Model std::shared_ptr ReadModel(std::string&& model_stream, const std::string& model_path); + OVExeNetwork StatefulCompileModel(std::shared_ptr& model, + std::string& hw_target, + const ov::AnyMap& device_config); // OV Interface for Compiling OV Model Type OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, std::string& hw_target, ov::AnyMap& device_config, + bool enable_causallm, const std::string& name); // OV Interface for Fast Compile OVExeNetwork CompileModel(const std::string& onnx_model, @@ -83,16 +87,20 @@ struct OVCore : WeakSingleton { }; class OVExeNetwork { - ov::CompiledModel obj; + ov::CompiledModel compiled_model_obj; + std::string target_device; + bool is_stateful_causallm; public: - explicit OVExeNetwork(ov::CompiledModel md) : obj(md) {} - OVExeNetwork() : obj(ov::CompiledModel()) {} - ov::CompiledModel& Get() { return obj; } - OVInferRequest CreateInferRequest(); + explicit OVExeNetwork(ov::CompiledModel compiled_model, std::string device, bool stateful_causallm = false) + : compiled_model_obj(compiled_model), target_device(device), is_stateful_causallm(stateful_causallm) {} + OVExeNetwork() : compiled_model_obj(ov::CompiledModel()) {} + ov::CompiledModel& Get() { return compiled_model_obj; } + std::shared_ptr CreateInferRequest(); }; class OVInferRequest { + protected: ov::InferRequest ovInfReq; public: @@ -100,16 +108,42 @@ class OVInferRequest { OVTensorPtr GetTensor(const std::string& name); std::string GetInputTensorName(uint32_t index); void SetTensor(const std::string& name, OVTensorPtr& blob); - void StartAsync(); - void Infer(); + virtual void StartAsync(); + virtual void Infer(); void WaitRequest(); void CancelRequest(); void QueryStatus(); - explicit OVInferRequest(ov::InferRequest obj) : ovInfReq(std::move(obj)) {} + explicit OVInferRequest(ov::InferRequest infer_request_obj) : ovInfReq(std::move(infer_request_obj)) {} OVInferRequest() : ovInfReq(ov::InferRequest()) {} ov::InferRequest& GetNewObj() { return ovInfReq; } + virtual void RewindKVCache(size_t index) {} +}; + +class StatefulOVInferRequest : public OVInferRequest { + public: + explicit StatefulOVInferRequest(ov::InferRequest infer_request, std::string device); + + void StartAsync() override; + void Infer() override; + void RewindKVCache(size_t index) override; + void FillTensor(const std::string& tensor_name, const ov::element::Type& type, + const std::vector& shape, int32_t fill_value); + void CacheTensor(const std::string& tensor_name, std::vector& cache); + void SetTensorFromCache(const std::string& tensor_name, const std::vector& cache_data); + std::optional FindTensor(const std::string& tensor_name); + + private: + void PreProcessInferRequest(); + std::string target_device; + + // If prefill_use_full_chat_history is true, cache the "input_ids" & "position_ids" tensors, + // and ensure that full chat history is passed for each prefill call. 
+ bool prefill_use_full_chat_history = false; + std::vector cached_input_ids; + std::vector cached_position_ids; }; + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc new file mode 100644 index 0000000000000..67ba42884e4f0 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -0,0 +1,350 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#include "core/providers/openvino/ov_stateful_patch_utils.h" + +namespace onnxruntime { +namespace openvino_ep { + +void LogBasicModelInfo(const std::shared_ptr& model) { + std::cout << "Model Name: " << model->get_friendly_name() << std::endl; + + // Log detailed information about model inputs and outputs + auto inputs = model->inputs(); + auto outputs = model->outputs(); + + std::cout << "\tInputs: " << std::endl; + for (const ov::Output& input : inputs) { + const std::string name = input.get_any_name(); + const ov::element::Type type = input.get_element_type(); + const ov::PartialShape shape = input.get_partial_shape(); + const ov::Layout layout = ov::layout::get_layout(input); + + std::cout << "\t\t" << name << ", " << type << ", " << shape << ", " << layout.to_string() << std::endl; + } + + std::cout << "\tOutputs: " << std::endl; + for (const ov::Output& output : outputs) { + const std::string name = output.get_any_name(); + const ov::element::Type type = output.get_element_type(); + const ov::PartialShape shape = output.get_partial_shape(); + const ov::Layout layout = ov::layout::get_layout(output); + + std::cout << "\t\t" << name << ", " << type << ", " << shape << ", " << layout.to_string() << std::endl; + } + + return; +} + +bool ModelHasInputOutputNames(std::shared_ptr model, const std::string& name_to_match) { + for (const ov::Output& input : model->inputs()) { + auto& names = input.get_names(); + + for (auto& name : names) { + if (name == name_to_match) { + return true; + } + } + } + + for (const ov::Output& output : model->outputs()) { + auto& names = output.get_names(); + for (auto& name : names) { + if (name == name_to_match) { + return true; + } + } + } + + return false; +} + +void FuseCacheReorder(std::shared_ptr ov_model, + std::vector& not_kv_inputs, + const std::vector& key_value_input_names, + int gather_dim) { + if (ModelHasInputOutputNames(ov_model, "beam_idx")) { + throw std::runtime_error("Model already has fused cache"); + } + + std::string main_input_name = "inputs_embeds"; + if (ModelHasInputOutputNames(ov_model, "input_ids")) { + main_input_name = "input_ids"; + } + + auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0]; + + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({input_batch})); + beam_idx->set_friendly_name("beam_idx"); + beam_idx->output(0).get_tensor().add_names({"beam_idx"}); + ov_model->add_parameters({beam_idx}); + not_kv_inputs.push_back(beam_idx->get_friendly_name()); + + // Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx + for (const auto& input_name : key_value_input_names) { + auto parameter_output_port = ov_model->input(input_name); + auto consumers = parameter_output_port.get_target_inputs(); + + auto gather_op = + std::make_shared(parameter_output_port, + beam_idx, + ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim})); + + // Replace the source output for all consumers of the input tensor + 
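// End state after the loop below, per KV-cache input (illustrative):
//   past_key_values.N -> Gather(indices = beam_idx, axis = gather_dim) -> original consumers
// so beam reordering happens inside the graph through the new beam_idx parameter.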
for (auto& consumer : consumers) { + consumer.replace_source_output(gather_op->output(0)); + } + } + + // Validate the modified model + ov_model->validate_nodes_and_infer_types(); +} + +void MakeStateful(std::shared_ptr& ov_model, + const std::vector& key_value_input_names, + const std::vector& key_value_output_names) { + std::map input_output_map; + + // Create mapping for KV-cache inputs and outputs + for (size_t i = 0; i < key_value_input_names.size(); ++i) { + input_output_map[key_value_input_names[i]] = key_value_output_names[i]; + } + + // Apply the transformation to make the model stateful + ov::pass::Manager manager; + manager.register_pass(input_output_map); + manager.run_passes(ov_model); +} + +// Converted to C++ from below reference URL: +// https://github.com/huggingface/optimum-intel/blob/main/optimum/exporters/openvino/stateful.py#L281 +void PatchStatefulDecoder(std::shared_ptr model) { + std::vector key_value_input_names; + std::vector not_kv_inputs; + for (const ov::Output& input : model->inputs()) { + auto& names = input.get_names(); + + bool found = false; + for (auto& name : names) { + if (name.find("key_values") != std::string::npos) { + key_value_input_names.push_back(name); + found = true; + break; + } + } + + if (!found) { + not_kv_inputs.push_back(input.get_any_name()); + } + } + + std::vector key_value_output_names; + for (const ov::Output& output : model->outputs()) { + auto& names = output.get_names(); + for (auto& name : names) { + if (name.find("present") != std::string::npos) { + key_value_output_names.push_back(name); + break; + } + } + } + + if (key_value_input_names.empty() || key_value_output_names.empty()) { + std::cout << "no key_value_input_names or key_value_output_names found" << std::endl; + return; + } + + // By default, batch is the 0 - th but chatglm uses 1 - st dimension as batch + // TODO(ryan): Deduce from a model via ordinal reshape(? ) and topology + // batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0 + auto batch_dim = 0; + + FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim); + + MakeStateful(model, key_value_input_names, key_value_output_names); +} + +// Some other utility functions copied from OpenVINO GenAI +bool HasOpWithType(const std::shared_ptr& function, const std::string& type_name) { + for (const auto& op : function->get_ops()) { + if (op->get_type_name() == type_name) { + return true; + } + } + return false; +} + +std::tuple, int64_t> FindLLMMatmul(const std::shared_ptr& model) { + auto last_node = model->output(0).get_node()->input_value(0).get_node_shared_ptr(); + std::shared_ptr matmul = ov::as_type_ptr(last_node); + + // In the case of PagedAttention, all tokens are moved to the batch dimension, + // and slicing/gathering must be performed accordingly. + const bool pa_based_model = HasOpWithType(model, "PagedAttentionExtension"); + int64_t slice_gather_dim = pa_based_model ? 
0 : 1; + + // There are several patterns for MatMul we are looking for: + // MatMul -> Result + // MatMul -> Add -> Result + // MatMul -> Transpose -> Result + // MatMul -> Divide -> Tanh -> Multiply -> Result + // MatMul -> Convert -> Result + if (!matmul) { + if (auto add = ov::as_type_ptr(last_node)) { + matmul = ov::as_type_ptr(add->input_value(0).get_node_shared_ptr()); + } else if (auto transpose = ov::as_type_ptr(last_node)) { + matmul = ov::as_type_ptr(transpose->input_value(0).get_node_shared_ptr()); + auto order = ov::as_type_ptr(transpose->input_value(1).get_node_shared_ptr())->get_axis_vector_val(); + slice_gather_dim = order[slice_gather_dim]; + } else if (auto multiply = ov::as_type_ptr(last_node)) { + if (auto tanh = ov::as_type_ptr(multiply->input_value(0).get_node_shared_ptr())) { + if (auto divide = ov::as_type_ptr(tanh->input_value(0).get_node_shared_ptr())) { + matmul = ov::as_type_ptr(divide->input_value(0).get_node_shared_ptr()); + } + } + } else if (auto convert = ov::as_type_ptr(last_node)) { + matmul = ov::as_type_ptr(convert->input_value(0).get_node_shared_ptr()); + } + } + return std::make_tuple(matmul, slice_gather_dim); +} + +void ApplySliceBeforeMatmulTransformation(std::shared_ptr model) { + std::shared_ptr matmul = nullptr; + int64_t slice_gather_dim = -1; + std::tie(matmul, slice_gather_dim) = FindLLMMatmul(model); + + if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) { + auto start = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{-1}); + auto stop = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{-2}); + auto step = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{-1}); + auto axis = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{slice_gather_dim}); + auto slice = std::make_shared(matmul->input_value(0), start, stop, step, axis); + matmul->input(0).replace_source_output(slice); + } +} + +void UpdateConfig(ov::AnyMap& config, const std::pair& pair) { + if (config.count(pair.first) == 0) { + config.insert(pair); + } +} + +std::optional PopOption(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + std::optional found = std::make_optional(it->second); + config.erase(it); + return found; + } + return std::nullopt; +} + +void RenameKey(ov::AnyMap& config, const std::string& old_key, const std::string& new_key) { + if (config.count(old_key) != 0) { + auto opt_value = PopOption(config, old_key); + config[new_key] = opt_value.value(); + } +} + +KVAxesPosition GetKVAxesPos(std::shared_ptr model) { + // Sequence length axis in key/values tensors. For most cases, the tensor shape is + // [batch_size, num_kv_heads, seq_len, head_size]. Therefore, the sequence length axis + // is usually at index 2, and the batch axis is at index 0. + KVAxesPosition kv_pos{0u, 2u}; + + // "ReadValue" node is KV cache representation in stateful model + std::string kv_node_type_name = std::string(ov::op::v6::ReadValue::get_type_info_static().name); + + for (const auto& op : model->get_ops()) { + // Check input size, as in LoRA adapters case it could be 0 + if (op->get_type_name() != kv_node_type_name || op->get_input_size() < 1) { + continue; + } + + // Shape example: [-1,4,0,64] + auto shape = op->get_input_partial_shape(0); + + for (int64_t i = 0; i < shape.rank().get_length(); i++) { + // Find axis = 0. This would be sequence length axis. 
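// Worked example: for the shape [-1, 4, 0, 64] above, index 0 is dynamic (batch) and
// index 2 is the zero-sized sequence axis, so kv_pos ends up as {batch = 0, seq_len = 2}.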
+ if (shape[i] == 0) { + kv_pos.seq_len = i; + } else if (shape[i].is_dynamic()) { + // Dynamic axis is a batch + kv_pos.batch = i; + } + } + break; + } + + return kv_pos; +} + +void UpdateNPUConfig(ov::AnyMap& config, const KVAxesPosition& kv_pos, const KVDesc& kv_desc) { + UpdateConfig(config, {"NPU_USE_NPUW", "YES"}); + UpdateConfig(config, {"NPUW_LLM", "YES"}); + + UpdateConfig(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch}); + UpdateConfig(config, {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len}); + + UpdateConfig(config, {"NPUW_LLM_MAX_PROMPT_LEN", kv_desc.max_prompt_len}); + UpdateConfig(config, {"NPUW_LLM_MIN_RESPONSE_LEN", kv_desc.min_response_len}); + + RenameKey(config, "++PREFILL_CONFIG", "++NPUW_LLM_PREFILL_CONFIG"); + RenameKey(config, "++GENERATE_CONFIG", "++NPUW_LLM_GENERATE_CONFIG"); + RenameKey(config, "PREFILL_CONFIG", "NPUW_LLM_PREFILL_CONFIG"); + RenameKey(config, "PREFILL_HINT", "NPUW_LLM_PREFILL_HINT"); + RenameKey(config, "GENERATE_CONFIG", "NPUW_LLM_GENERATE_CONFIG"); + RenameKey(config, "GENERATE_HINT", "NPUW_LLM_GENERATE_HINT"); + + const size_t npuw_context_len_threshold = 2048; + if ((kv_desc.max_prompt_len + kv_desc.min_response_len) >= npuw_context_len_threshold) { + // This improves accuracy for generation sequences that exceed 2k tokens. + config["++NPUW_LLM_PREFILL_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "NPU,CPU"}, {"NPUW_ONLINE_AVOID", "P:SinCos/NPU"}}; + config["++NPUW_LLM_GENERATE_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "NPU,CPU"}, {"NPUW_ONLINE_AVOID", "P:SinCos/NPU"}}; + } +} + +std::optional PopOptionNew(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + std::optional found = std::make_optional(it->second); + config.erase(it); + return found; + } + return std::nullopt; +} + +std::optional PopIntAndCast(ov::AnyMap& config, const std::string& key) { + auto anyopt = PopOptionNew(config, key); + if (anyopt.has_value()) { + const auto any = anyopt.value(); + int64_t value; + // NB: Integer value coming from python has int64_t datatype + if (any.is()) { + value = any.as(); + } else if (any.is()) { + value = any.as(); + } else { + OPENVINO_THROW("Failed to extract " + key + ". 
Type mismatch: expected types: int or int64_t"); + } + if (value < 0) { + OPENVINO_THROW(key + " cannot be negative!"); + } + return std::make_optional(static_cast(value)); + } + return std::nullopt; +} + +bool IsStateful(const std::shared_ptr& model) { + for (auto&& ptr : model->get_ordered_ops()) { + if (ov::is_type(ptr) || + ov::is_type(ptr) || + ov::is_type(ptr) || + ov::is_type(ptr)) { + return true; + } + } + return false; +} + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h new file mode 100644 index 0000000000000..0b89c4ed02e13 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h @@ -0,0 +1,84 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "openvino/pass/manager.hpp" +#include "openvino/pass/make_stateful.hpp" +#include "openvino/opsets/opset13.hpp" + +namespace onnxruntime { +namespace openvino_ep { + +void LogBasicModelInfo(const std::shared_ptr& model); + +bool ModelHasInputOutputNames(std::shared_ptr model, const std::string& name_to_match); + +void FuseCacheReorder(std::shared_ptr ov_model, + std::vector& not_kv_inputs, + const std::vector& key_value_input_names, + int gather_dim); + +void MakeStateful(std::shared_ptr& ov_model, + const std::vector& key_value_input_names, + const std::vector& key_value_output_names); + +void PatchStatefulDecoder(std::shared_ptr model); + +bool HasOpWithType(const std::shared_ptr& function, const std::string& type_name); + +std::tuple, int64_t> FindLLMMatmul(const std::shared_ptr& model); + +void ApplySliceBeforeMatmulTransformation(std::shared_ptr model); + +void UpdateConfig(ov::AnyMap& config, const std::pair& pair); + +std::optional PopOption(ov::AnyMap& config, const std::string& option_name); + +void RenameKey(ov::AnyMap& config, const std::string& old_key, const std::string& new_key); + +struct KVAxesPosition { + size_t batch; + size_t seq_len; +}; + +KVAxesPosition GetKVAxesPos(std::shared_ptr model); + +struct KVDesc { + uint32_t max_prompt_len; + uint32_t min_response_len; +}; + +struct CausalLMConfig { + void ApplyConfig(const ov::AnyMap& external_config, ov::AnyMap& genai_config) { + if (external_config.find("MAX_PROMPT_LEN") != external_config.end()) { + max_prompt_len = external_config.at("MAX_PROMPT_LEN").as(); + } + if (external_config.find("MIN_RESPONSE_LEN") != external_config.end()) { + min_response_len = external_config.at("MIN_RESPONSE_LEN").as(); + } + genai_config["MAX_PROMPT_LEN"] = ov::Any(max_prompt_len); + genai_config["MIN_RESPONSE_LEN"] = ov::Any(min_response_len); + } + + unsigned int max_prompt_len = 1024; + unsigned int min_response_len = 128; +}; + +void UpdateNPUConfig(ov::AnyMap& config, const KVAxesPosition& kv_pos, const KVDesc& kv_desc); + +std::optional PopOptionNew(ov::AnyMap& config, const std::string& option_name); +std::optional PopIntAndCast(ov::AnyMap& config, const std::string& key); + +bool IsStateful(const std::shared_ptr& model); + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/attention_op_test.cc b/onnxruntime/test/contrib_ops/attention_op_test.cc index 61e5fa05c66c1..4dff0376fcd84 100644 --- a/onnxruntime/test/contrib_ops/attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/attention_op_test.cc @@ -2047,7 +2047,7 @@ TEST(AttentionTest, 
AttentionPastState_dynamic) { test.AddInput("past", past_dims, past_data); test.AddReferenceOutputs("testdata/attention_past_state.onnx", 0.005f); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); } #endif //! defined(__wasm__) diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 05136ec0750a1..e8eda5af1dc29 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -760,6 +760,15 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); } else { ORT_THROW("[ERROR] [OpenVINO] The value for the key 'enable_qdq_optimizer' should be a boolean i.e. true or false. Default value is false.\n"); } + } else if (key == "enable_causallm") { + if (value == "true" || value == "True" || + value == "false" || value == "False") { + ov_options[key] = value; + } else { + ORT_THROW( + "[ERROR] [OpenVINO] The value for the key 'enable_causallm' should be a boolean i.e. true or false." + " Default value is false. This provider option must be used with CausalLM Models viz. LLMs & SLMs only.\n"); + } } else if (key == "disable_dynamic_shapes") { if (value == "true" || value == "True" || value == "false" || value == "False") { @@ -817,7 +826,8 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); ORT_THROW( "[ERROR] [OpenVINO] wrong key type entered. Choose from the following runtime key options that are available for OpenVINO." " ['device_type', 'device_id', 'num_of_threads', 'load_config', 'cache_dir', 'num_streams', " - "'enable_opencl_throttling', 'disable_dynamic_shapes', 'enable_qdq_optimizer', 'model_priority'] \n"); + "'enable_opencl_throttling', 'disable_dynamic_shapes', 'enable_qdq_optimizer'," + " 'enable_causallm', 'model_priority'] \n"); } } session_options.AppendExecutionProvider_OpenVINO_V2(ov_options);
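For reference, a minimal application-side sketch of how the new pieces are expected to fit together: enable_causallm goes in through the OpenVINO provider options (with a load_config JSON assumed to carry the NPU MAX_PROMPT_LEN / MIN_RESPONSE_LEN settings), and the kvcache_rewind dynamic option handled in openvino_execution_provider.cc is delivered through Session::SetEpDynamicOptions. This is not part of the patch; paths, option values and the rewind index are placeholders, and the snippet assumes the C++ wrapper for SetEpDynamicOptions is available.

#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ovep-causallm");
  Ort::SessionOptions so;
  so.AppendExecutionProvider_OpenVINO_V2({{"device_type", "NPU"},
                                          {"enable_causallm", "true"},
                                          {"load_config", "npu_genai_config.json"}});
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);

  // ... run prefill / decode iterations via session.Run(...) ...

  // Roll the OpenVINO-managed KV cache back to 32 tokens, e.g. after the
  // application discards part of the generated sequence.
  const char* keys[] = {"kvcache_rewind"};
  const char* values[] = {"32"};
  session.SetEpDynamicOptions(keys, values, 1);
  return 0;
}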