diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 139a0eac512a4..13f09b9d9acdb 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -83,23 +83,22 @@ BackendManager::BackendManager(SessionContext& session_context, } std::string device_type = session_context_.device_type; - // Check if model is using external weights - if (auto filename = backend_utils::GetExternalWeightFilename(subgraph)) { - std::filesystem::path weights_filepath = session_context_.onnx_model_path_name.parent_path() / filename.value(); - - // Initialize external weights with fully qualified path - if (!std::filesystem::exists(weights_filepath)) { - ORT_THROW("Error: Failed to locate weight file at ", weights_filepath.string()); + auto& sw = shared_context_.shared_weights; + if (session_context_.so_share_ep_contexts) { + std::filesystem::path weight_filename = session_context_.onnx_model_path_name.parent_path(); + if (sw.external_weight_filename.empty() && !sw.metadata.empty()) { + // Reasonable assumption that all metadata entries have the same external file location + sw.external_weight_filename = sw.metadata.begin()->second.location; } + weight_filename /= sw.external_weight_filename; + std::ifstream weight_file(weight_filename); - external_weights_.emplace(weights_filepath); - } - - if (session_context_.so_share_ep_contexts) { - ORT_ENFORCE(external_weights_.has_value(), "Expected external weight object to be valid"); - backend_utils::CreateOVTensors(session_context_.device_type, - shared_context_.shared_weights.metadata, - external_weights_.value()); + if (weight_file) { + if (!sw.mapped_weights) { + sw.mapped_weights = std::make_unique(weight_filename); + } + backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, *sw.mapped_weights); + } } if (ModelHasSymbolicInputDims(subgraph)) { @@ -325,7 +324,7 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) { static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name, [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto, [[maybe_unused]] const onnxruntime::Node& fused_node) { -#ifdef NOT_RELEASE +#ifndef RELEASE if (openvino_ep::backend_utils::IsDebugEnabled()) { auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : onnx_model_path_name.filename(); @@ -385,12 +384,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, if (session_context_.device_type.find("NPU") != std::string::npos && (enable_ovep_qdq_optimizer || session_context_.so_share_ep_contexts)) { std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, - logger, - session_context_.so_share_ep_contexts, - enable_ovep_qdq_optimizer, - model, - shared_context_.shared_weights.metadata); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, enable_ovep_qdq_optimizer, model, shared_context_.shared_weights); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 22936acf3ea66..cdc27701ec2e6 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -54,7 +54,6 @@ class BackendManager { EPCtxHandler& ep_ctx_handle_; SessionContext& session_context_; SharedContext& shared_context_; - std::optional external_weights_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 58309d37877f1..2ee5e9ec3e3a9 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -4,7 +4,6 @@ #include #include #include -#include #include #include @@ -21,7 +20,22 @@ using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { -std::ostream& operator<<(std::ostream& stream, const Metadata::Map& metadata) { +SharedContext::SharedWeights::WeightsFile::WeightsFile(std::filesystem::path filename) : file_(filename, std::ios::in | std::ios::binary) { + try { + file_.exceptions(std::ifstream::failbit | std::ifstream::badbit); + weights_size_ = file_.seekg(0, std::ios::end).tellg(); + } catch (std::ifstream::failure& e) { + ORT_THROW("Error: Failed to open weight file at ", filename.string(), " ", e.what()); + } +} + +void SharedContext::SharedWeights::WeightsFile::load_weights(size_t file_offset, void* data, size_t size) { + ORT_ENFORCE(file_offset < weights_size_ && size <= weights_size_ && (file_offset <= weights_size_ - size), "Error: File offset is out of bounds."); + file_.seekg(file_offset); + file_.read(reinterpret_cast(data), size); +} + +std::ostream& operator<<(std::ostream& stream, const SharedContext::SharedWeights::Metadata::Map& metadata) { try { stream << metadata.size(); @@ -55,14 +69,14 @@ std::ostream& operator<<(std::ostream& stream, const Metadata::Map& metadata) { return stream; } -std::istream& operator>>(std::istream& stream, Metadata::Map& metadata) { +std::istream& operator>>(std::istream& stream, SharedContext::SharedWeights::Metadata::Map& metadata) { size_t map_size{0}; try { stream >> map_size; while (!stream.eof()) { - Metadata::Key key; - Metadata::Value value; + SharedContext::SharedWeights::Metadata::Key key; + SharedContext::SharedWeights::Metadata::Value value; stream >> key.name; stream >> value.location; stream >> value.data_offset; @@ -385,19 +399,8 @@ ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt // Function to handle tensor creation from external data void CreateOVTensors(const std::string& device_name, - Metadata::Map& metadata_map, - std::filesystem::path& weights_filepath) { - // File is guaranteed to exist at this point - std::ifstream file(weights_filepath, std::ios::in | std::ios::binary); - file.exceptions(std::ifstream::failbit | std::ifstream::badbit); - size_t weights_size = std::filesystem::file_size(weights_filepath); - - const auto load_weights = [&file, weights_size](size_t file_offset, void* data, size_t size) { - ORT_ENFORCE(file_offset < weights_size && size <= weights_size && (file_offset <= weights_size - size), "Error: File offset is out of bounds."); - file.seekg(file_offset); - file.read(reinterpret_cast(data), size); - }; - + SharedContext::SharedWeights::Metadata::Map& metadata_map, + SharedContext::SharedWeights::WeightsFile& weights) { for (auto& [key, value] : metadata_map) { if (value.tensor) continue; @@ -413,18 +416,18 @@ void CreateOVTensors(const std::string& device_name, auto&& remote_tensor = npu_context.create_l0_host_tensor(ov_elementType, value.dimensions, ov::intel_npu::TensorType::INPUT); // Copy data to remote tensor - load_weights(value.data_offset, remote_tensor.get(), value.size); + weights.load_weights(value.data_offset, remote_tensor.get(), value.size); value.tensor = std::make_shared(remote_tensor); } else { // Use vanilla tensors value.tensor = std::make_shared(ov_elementType, value.dimensions); - load_weights(value.data_offset, value.tensor->data(), value.size); + weights.load_weights(value.data_offset, value.tensor->data(), value.size); } ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch"); } } -void DestroyOVTensors(Metadata::Map& metadata_map) { +void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map) { for (auto& [key, value] : metadata_map) { if (value.tensor) { value.tensor.reset(); @@ -433,51 +436,6 @@ void DestroyOVTensors(Metadata::Map& metadata_map) { metadata_map.clear(); } -std::optional GetExternalWeightFilename(const GraphViewer& graph) { - auto get_external_location = [](const ONNX_NAMESPACE::TensorProto& proto) -> std::optional { - using mutable_proto_t = ONNX_NAMESPACE::TensorProto*; - auto& mutable_proto = *const_cast(&proto); - auto* entry_protos = mutable_proto.mutable_external_data(); - - if (proto.has_data_location() && proto.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - for (int i = 0; i < entry_protos->size(); i++) { - auto& string_entry_proto{entry_protos->at(i)}; - const auto& pb_key{*(string_entry_proto.mutable_key())}; - const auto& pb_value{*(string_entry_proto.mutable_value())}; - if (pb_key == "location") { - return std::make_optional(pb_value); - } - } - } - - return std::nullopt; - }; - - // Handle constant initializers - auto& initializers = graph.GetAllInitializedTensors(); - for (const auto& it : initializers) { - if (auto result = get_external_location(*it.second)) { - return result; - } - } - - // Handle outer-scope constant initializers - for (auto& node_idx : graph.GetNodesInTopologicalOrder()) { - const auto& node = graph.GetNode(node_idx); - for (const auto& input : node->InputDefs()) { - if (graph.IsConstantInitializer(input->Name(), true)) { - const auto& initializer_tensor = *graph.GetConstantInitializer(input->Name(), true); - - if (auto result = get_external_location(initializer_tensor)) { - return result; - } - } - } - } - - return std::nullopt; -} - } // namespace backend_utils } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index b56c5e6e7f6ef..f13b1b05ced67 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -67,18 +67,15 @@ CreateOVModel(std::string&& model, std::map>& const_outputs_map); void CreateOVTensors(const std::string& device_name, - Metadata::Map& metadata_map, - std::filesystem::path& weights_filepath); -void DestroyOVTensors(Metadata::Map& metadata_map); + SharedContext::SharedWeights::Metadata::Map& metadata_map, + SharedContext::SharedWeights::WeightsFile& weights); +void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map); void printPerformanceCounts(const std::vector& performanceMap, std::ostream& stream, std::string deviceName); void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std::string deviceName); -// Returns the location string from the first external initializer nodes found or nullopt if none found -std::optional GetExternalWeightFilename(const GraphViewer& graph); - } // namespace backend_utils } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index c11f853dd1122..c814df618e3b3 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -125,12 +125,10 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr std::function initializer = [](OVInferRequestPtr) {}; auto metadata = shared_context_.shared_weights.metadata; if (session_context_.so_share_ep_contexts) { - // When shared ep contexts is set external weight references are transformed to model inputs. This - // creates an initializer to populate/bind input weight tensors to each inference request initializer = [&metadata](OVInferRequestPtr ir_ptr) { const auto input_count = ir_ptr->GetNumInputs(); for (auto i = 0u; i < input_count; i++) { - using Key = Metadata::Key; + using Key = SharedContext::SharedWeights::Metadata::Key; const auto tensor_key = Key{ir_ptr->GetInputTensorName(i)}; if (metadata.contains(tensor_key)) { auto& value = metadata.at(tensor_key); @@ -139,8 +137,6 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } }; } - - // Create inference request queue and initialize according to passed function inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req, std::move(initializer))); } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index c0c4551607202..7560f4570bd32 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -18,29 +18,6 @@ namespace openvino_ep { namespace fs = std::filesystem; -struct Metadata { - struct Key { - std::string name; - bool operator==(const Key&) const = default; - }; - struct Hash { - std::size_t operator()(const Key& key) const noexcept { - return std::hash()(key.name); - } - }; - struct Value { - std::string location; - unsigned int data_offset; - unsigned int size; - std::vector dimensions; - std::int32_t element_type; - std::shared_ptr tensor; - }; - using Map = std::unordered_map; - friend std::ostream& operator<<(std::ostream& right, const Metadata::Map& metadata); - friend std::istream& operator>>(std::istream& right, Metadata::Map& metadata); -}; - class SharedContext : public WeakSingleton { // Keep the core alive as long as the shared SharedContext are alive. std::shared_ptr OVCore_; @@ -48,12 +25,45 @@ class SharedContext : public WeakSingleton { public: SharedContext() : OVCore_(OVCore::Get()) {} struct SharedWeights { + struct Metadata { + struct Key { + std::string name; + bool operator==(const Key&) const = default; + }; + struct Hash { + std::size_t operator()(const Key& key) const noexcept { + return std::hash()(key.name); + } + }; + struct Value { + std::string location; + unsigned int data_offset; + unsigned int size; + std::vector dimensions; + std::int32_t element_type; + std::shared_ptr tensor; + }; + using Map = std::unordered_map; + friend std::ostream& operator<<(std::ostream& right, const Metadata::Map& metadata); + friend std::istream& operator>>(std::istream& right, Metadata::Map& metadata); + }; + + struct WeightsFile { + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightsFile); + WeightsFile() = delete; + explicit WeightsFile(std::filesystem::path filename); + + void load_weights(size_t file_offset, void* data, size_t size); + + private: + std::ifstream file_; + size_t weights_size_; + }; + + fs::path external_weight_filename; + std::unique_ptr mapped_weights; Metadata::Map metadata; } shared_weights; - - void clear() { // Deletes the data stored in the SharedContext - shared_weights.metadata.clear(); - } }; using config_t = std::map; @@ -92,7 +102,6 @@ struct ProviderInfo { bool so_context_embed_mode{false}; // ORT session option bool so_share_ep_contexts{false}; // ORT session option fs::path so_context_file_path{}; // ORT session option - bool so_stop_share_ep_contexts{false}; // ORT session option const ConfigOptions* config_options{NULL}; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 767b6519f1387..f9d4ab13cf2ce 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -65,7 +65,6 @@ OpenVINOExecutionProvider::~OpenVINOExecutionProvider() { backend_manager.ShutdownBackendManager(); } backend_managers_.clear(); - shared_context_.reset(); } std::vector> @@ -107,12 +106,7 @@ common::Status OpenVINOExecutionProvider::Compile( auto& metadata = shared_context_->shared_weights.metadata; if (session_context_.so_share_ep_contexts && metadata.empty()) { // Metadata is always read from model location, this could be a source or epctx model - fs::path metadata_filename; - if (session_context_.so_context_file_path.empty()) { - metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; - } else { - metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; - } + fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; std::ifstream file(metadata_filename, std::ios::binary); if (file) { file >> metadata; @@ -197,10 +191,6 @@ common::Status OpenVINOExecutionProvider::Compile( } } - if (session_context_.so_stop_share_ep_contexts) { - shared_context_->clear(); - } - return status; } diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 93ec08b88ae21..f7f15dc62fd11 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -28,7 +28,6 @@ void ParseConfigOptions(ProviderInfo& pi) { pi.so_context_embed_mode = pi.config_options->GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; pi.so_share_ep_contexts = pi.config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; pi.so_context_file_path = pi.config_options->GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); - pi.so_stop_share_ep_contexts = pi.config_options->GetConfigOrDefault(kOrtSessionOptionStopShareEpContexts, "0") == "1"; if (pi.so_share_ep_contexts) { ov::AnyMap map; diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 61040c5552c71..860cfb5713903 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -11,7 +11,6 @@ #include #include #include -#include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/qdq_transformations/qdq_stripping.h" @@ -684,10 +683,10 @@ static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, - bool transform_weight_as_input, + bool enable_ovep_weight_sharing, bool enable_ovep_qdq_optimizer, /*out*/ std::unique_ptr& model, - /*out*/ Metadata::Map& weight_metadata) { + /*out*/ sw& shared_weights) { // NOTE: This function is a re-implementation of GraphViewerToProto() in core/graph/graph_proto_serializer.cc // with the following differences: // - Uses onnxruntime::Graph APIs instead of onnx::GraphProto APIs. @@ -778,7 +777,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, continue; // Already handled this node unit } - bool IsWeightSharingWithoutOVEPQDQStripping = transform_weight_as_input && !enable_ovep_qdq_optimizer; + bool IsWeightSharingWithoutOVEPQDQStripping = enable_ovep_weight_sharing && !enable_ovep_qdq_optimizer; if (node_unit->UnitType() == NodeUnit::Type::SingleNode) { AddStandaloneNodeUnit(dst_graph, src_graph, *node_unit, initializers_to_keep, IsWeightSharingWithoutOVEPQDQStripping, logger); @@ -803,9 +802,11 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, std::sort(const_inits.begin(), const_inits.end()); // initialize map for creating metadata for initilizers with external weights - const auto& insert_metadata = [&weight_metadata](const ONNX_NAMESPACE::TensorProto& proto) { - Metadata::Map::key_type key{proto.name()}; - Metadata::Map::mapped_type value{}; + auto& metadata = shared_weights.metadata; + + const auto& insert_metadata = [&metadata](const ONNX_NAMESPACE::TensorProto& proto) { + sw::Metadata::Map::key_type key{proto.name()}; + sw::Metadata::Map::mapped_type value{}; using mutable_proto_t = ONNX_NAMESPACE::TensorProto*; auto& mutable_proto = *const_cast(&proto); @@ -828,7 +829,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, dim = proto.dims()[index++]; } - weight_metadata.emplace(key, std::move(value)); + metadata.emplace(key, std::move(value)); }; // Handle constant initializers @@ -838,7 +839,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, // Check if the initializer has external data if (initializer_tensor.has_data_location() && initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && - transform_weight_as_input) { + enable_ovep_weight_sharing) { insert_metadata(initializer_tensor); // Add initializer with external data as input @@ -866,7 +867,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, // Check if the initializer has external data if (initializer_tensor.has_data_location() && initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && - transform_weight_as_input) { + enable_ovep_weight_sharing) { insert_metadata(initializer_tensor); // Add initializer as input if it has external data diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h index 7e87352e5992d..53de0fd019311 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h @@ -10,12 +10,16 @@ namespace onnxruntime { namespace openvino_ep { +using sw = SharedContext::SharedWeights; + // Creates a new model without the DQ/Q operators in the src graph as per pre-defined rulesets Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, - bool transform_weight_as_input, + bool enable_ovep_weight_sharing, bool enable_ovep_qdq_optimizer, /*out*/ std::unique_ptr& model, - /*out*/ Metadata::Map& metadata); + /*out*/ sw& shared_weights); + +bool dumpMetaDataMapToBinary(const sw::Metadata::Map& shared_weights, const std::string& filename); } // namespace openvino_ep } // namespace onnxruntime