diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
index 446ed098521cb..23be3447b8799 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.cc
+++ b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -374,11 +374,42 @@ void OVInferRequest::Infer() {
 StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device)
     : OVInferRequest(std::move(infer_request)), target_device(device) {
   bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos));
-  if (gpu_or_npu) {
+
+  _npu_logits_slice_required = IsNPULogitsSliceRequired();
+
+  // Check whether there is an input_ids tensor and whether its element type is int64, since the
+  // prefill_use_full_chat_history logic only applies to specific inputs and data types.
+  auto input_ids_opt = FindTensor("input_ids");
+  if (gpu_or_npu && input_ids_opt.has_value() && input_ids_opt->get_element_type() == ov::element::i64) {
     prefill_use_full_chat_history = true;
   }
 }
 
+static inline bool IsNPUWSliceOutEnabled(const ov::CompiledModel& compiled_model) {
+  auto slice_out_val = compiled_model.get_property("NPUW_SLICE_OUT");
+  if (!slice_out_val.empty()) {
+    if (slice_out_val.is<std::string>()) {
+      return (slice_out_val.as<std::string>() == "YES");
+    } else if (slice_out_val.is<bool>()) {
+      return slice_out_val.as<bool>();
+    }
+  }
+
+  return false;
+}
+
+bool StatefulOVInferRequest::IsNPULogitsSliceRequired() {
+  if (target_device.find("NPU") != std::string::npos) {
+    const auto& model = ovInfReq.get_compiled_model();
+    // If NPUW_SLICE_OUT is enabled, slicing within OVEP is not required.
+    // Otherwise, if NPUW_SLICE_OUT is NOT enabled, OVEP needs to perform explicit logit
+    // slicing itself.
+    return !IsNPUWSliceOutEnabled(model);
+  }
+
+  return false;
+}
+
 void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type,
                                         const std::vector<size_t>& shape, int32_t fill_value) {
   ov::Tensor tensor = ov::Tensor(type, shape);
@@ -519,5 +550,46 @@ void StatefulOVInferRequest::RewindKVCache(size_t index) {
     }
   }
 }
+
+OVTensorPtr StatefulOVInferRequest::GetTensor(const std::string& input_name) {
+
+  auto tobj = OVInferRequest::GetTensor(input_name);
+
+  if (_npu_logits_slice_required) {
+    if (input_name == "logits") {
+      if (tobj->get_shape().size() != 3) {
+        ORT_THROW(log_tag + std::format("Expected logits to have shape of rank 3, but it has shape of rank {}",
+                                        tobj->get_shape().size()));
+      }
+
+      // When _npu_logits_slice_required is true, prefill may produce logits of shape:
+      //   [batch_size, sequence_length, vocab_size]
+      // (where 'sequence_length' is the number of input tokens to prefill),
+      // but ORT GenAI expects to receive logits of shape:
+      //   [batch_size, 1, vocab_size]
+      // So detect when shape[1] is greater than 1 and, if so, return a slice of shape [batch_size, 1, vocab_size].
+      if (tobj->get_shape()[1] > 1) {
+        return OvExceptionBoundary([&]() {
+          const ov::Coordinate begin = {0, tobj->get_shape()[1] - 1, 0};
+          const ov::Coordinate end = {tobj->get_shape()[0], tobj->get_shape()[1], tobj->get_shape()[2]};
+          auto sliced_tensor = ov::Tensor(*tobj, begin, end);
+          if (sliced_tensor.is_continuous()) {
+            OVTensorPtr blob = std::make_shared<ov::Tensor>(sliced_tensor);
+            return blob;
+          } else {
+            auto continuous_sliced_tensor = ov::Tensor(sliced_tensor.get_element_type(), sliced_tensor.get_shape());
+            sliced_tensor.copy_to(continuous_sliced_tensor);
+            OVTensorPtr blob = std::make_shared<ov::Tensor>(continuous_sliced_tensor);
+            return blob;
+          }
+        },
+                                   "Could not create sliced logits tensor");
+      }
+    }
+  }
+
+  return tobj;
+}
+
 }  // namespace openvino_ep
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h
index 8a55fdcbd4fb4..8fc28b8885e5d 100644
--- a/onnxruntime/core/providers/openvino/ov_interface.h
+++ b/onnxruntime/core/providers/openvino/ov_interface.h
@@ -110,7 +110,7 @@ class OVInferRequest {
  public:
   uint32_t GetNumInputs();
 
-  OVTensorPtr GetTensor(const std::string& name);
+  virtual OVTensorPtr GetTensor(const std::string& name);
   std::string GetInputTensorName(uint32_t index);
 
   // Set tensor call infer req tensor if ort_ptr differs from last set ptr.
@@ -147,6 +147,7 @@ class StatefulOVInferRequest : public OVInferRequest {
   void CacheTensor(const std::string& tensor_name, std::vector<int64_t>& cache);
   void SetTensorFromCache(const std::string& tensor_name, const std::vector<int64_t>& cache_data);
   std::optional<ov::Tensor> FindTensor(const std::string& tensor_name);
+  OVTensorPtr GetTensor(const std::string& name) override;
 
  private:
   void PreProcessInferRequest();
@@ -157,6 +158,9 @@ class StatefulOVInferRequest : public OVInferRequest {
   bool prefill_use_full_chat_history = false;
   std::vector<int64_t> cached_input_ids;
   std::vector<int64_t> cached_position_ids;
+
+  bool IsNPULogitsSliceRequired();
+  bool _npu_logits_slice_required = false;
 };
 
 }  // namespace openvino_ep
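A note on the property check: `compiled_model.get_property("NPUW_SLICE_OUT")` returns an `ov::Any`, and the diff normalizes it whether the plugin reports the value as a `"YES"`/`"NO"` string or as a bool (the `std::string`/`bool` template arguments are the assumed types here). Below is a minimal standalone sketch of that normalization pattern; `AnyToBool` is a hypothetical helper name, not part of the PR.

```cpp
#include <openvino/openvino.hpp>
#include <iostream>
#include <string>

// Hypothetical helper: collapse an ov::Any property value to a bool, assuming the
// plugin may report it either as a "YES"/"NO" string or as a native bool.
static bool AnyToBool(const ov::Any& value) {
  if (value.empty()) return false;
  if (value.is<std::string>()) return value.as<std::string>() == "YES";
  if (value.is<bool>()) return value.as<bool>();
  return false;
}

int main() {
  std::cout << AnyToBool(ov::Any(std::string("YES"))) << "\n";  // prints 1
  std::cout << AnyToBool(ov::Any(false)) << "\n";               // prints 0
  return 0;
}
```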
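The new `StatefulOVInferRequest::GetTensor` override relies on the `ov::Tensor` region-of-interest constructor to keep only the last sequence position of the prefill logits, copying into a densely packed tensor when the view is not continuous. Here is a minimal standalone sketch of that technique, using a dummy `[batch, seq_len, vocab]` fp32 tensor in place of a real inference output:

```cpp
#include <openvino/openvino.hpp>
#include <iostream>
#include <numeric>

int main() {
  // Dummy "prefill" logits: batch=1, seq_len=4, vocab=8, filled with 0..31.
  ov::Tensor logits(ov::element::f32, ov::Shape{1, 4, 8});
  std::iota(logits.data<float>(), logits.data<float>() + logits.get_size(), 0.0f);

  const auto& shape = logits.get_shape();
  // Region of interest covering only the last sequence position: [batch, 1, vocab].
  const ov::Coordinate begin{0, shape[1] - 1, 0};
  const ov::Coordinate end{shape[0], shape[1], shape[2]};
  ov::Tensor sliced(logits, begin, end);

  ov::Tensor out = sliced;
  if (!sliced.is_continuous()) {
    // ROI views can be strided; copy into a densely packed tensor before handing it on.
    out = ov::Tensor(sliced.get_element_type(), sliced.get_shape());
    sliced.copy_to(out);
  }

  std::cout << "sliced shape: " << out.get_shape()
            << ", first value: " << out.data<float>()[0] << "\n";
  return 0;
}
```

Whether the ROI view is already continuous depends on how the slice lines up with the parent tensor's memory layout, which is why both the sketch and the PR keep the copy fallback path.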