diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index b7e6245b1834f..c22f2e9cc0fa1 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -44,6 +44,10 @@ BackendManager::BackendManager(SessionContext& session_context, shared_context_{shared_context} { subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph); + bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos; + bool npu = session_context_.device_type.find("NPU") != std::string::npos; + subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) { // return empty if graph has no inputs or if types are not one of FP32/FP16 // else assume the type of the first input @@ -105,8 +109,7 @@ BackendManager::BackendManager(SessionContext& session_context, if (ModelHasSymbolicInputDims(subgraph)) { subgraph_context_.has_dynamic_input_shape = true; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims"; - if ((session_context_.device_type.find("CPU") != std::string::npos || - session_context_.device_type.find("GPU") != std::string::npos) && + if (cpu_or_gpu || (npu && session_context_.enable_causallm) && !session_context_.disable_dynamic_shapes) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. " << "Creating backend Dynamic Shapes"; @@ -480,6 +483,9 @@ BackendManager::ReWriteBatchDimWithOne(const ONNX_NAMESPACE::ModelProto& model_p void BackendManager::Compute(OrtKernelContext* context) { Ort::KernelContext ctx(context); std::chrono::high_resolution_clock::time_point start_compute, end_compute; + bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos; + bool npu = session_context_.device_type.find("NPU") != std::string::npos; #ifdef OPENVINO_FIL_ENABLED static bool fil_enabled = true; if (fil_enabled) { @@ -493,8 +499,7 @@ void BackendManager::Compute(OrtKernelContext* context) { // disable_dynamic_shapes is always set to true for OV NPU plugin. 
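The two hunks above introduce the cpu_or_gpu / npu flags and extend the dynamic-shape path to NPU when enable_causallm is set. A minimal sketch of the intended predicate, matching the check used in Compute(); UseDynamicShapeBackend is a hypothetical helper name, not part of the patch:

static bool UseDynamicShapeBackend(bool cpu_or_gpu, bool npu,
                                   bool enable_causallm, bool disable_dynamic_shapes) {
  // CPU and GPU keep the existing dynamic-shape handling; NPU joins it only for
  // stateful causal-LM compilation, and never when dynamic shapes are disabled.
  return (cpu_or_gpu || (npu && enable_causallm)) && !disable_dynamic_shapes;
}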
if (subgraph_context_.has_dynamic_input_shape && !session_context_.disable_dynamic_shapes && - (session_context_.device_type.find("CPU") != std::string::npos || - session_context_.device_type.find("GPU") != std::string::npos)) { + (cpu_or_gpu || (npu && session_context_.enable_causallm))) { concrete_backend_->Infer(context); } else if (subgraph_context_.has_dynamic_input_shape) { std::vector> tensor_shapes = GetInputTensorShapes(ctx); @@ -567,5 +572,11 @@ void BackendManager::ShutdownBackendManager() { concrete_backend_.reset(); } +void BackendManager::RewindKVCache(size_t index) { + if (concrete_backend_) { + concrete_backend_->RewindKVCache(index); + } +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index cb1ca7001a00c..799dc50dd7a63 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -30,6 +30,7 @@ class BackendManager { SessionContext& GetSessionContext(); Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph); ov::CompiledModel GetOVCompiledModel(); + void RewindKVCache(size_t index); private: std::unique_ptr GetModelProtoFromFusedNode( diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index dedb6da1bae58..7902b3edb2276 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -15,6 +15,7 @@ #include "core/providers/openvino/backends/basic_backend.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" #include "core/providers/openvino/backend_manager.h" +#include "core/providers/openvino/ov_stateful_patch_utils.h" namespace onnxruntime { @@ -29,6 +30,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr ptr_stream_t& model_stream) : session_context_{session_context}, subgraph_context_{subgraph_context}, shared_context_{shared_context} { std::string& hw_target = session_context_.device_type; + bool enable_causallm = session_context_.enable_causallm; if (ValidateSubgraph(const_outputs_map_)) return; @@ -43,7 +45,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Setting OpenCL queue throttling for GPU EnableGPUThrottling(device_config); - // Enable streams; default=1 unless ovverriden by user config + // Enable streams; default=1 unless overridden by user configuration EnableStreams(); // Set the inference_num_threads property of the CPU @@ -76,7 +78,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } else if (!session_context_.has_external_weights && !subgraph_context_.has_dynamic_input_shape && !session_context_.so_context_enable && - auto_unified_compile) { + !enable_causallm && auto_unified_compile) { // Unified OV compile_model is efficient when ov model caching is enabled // Unified OV compile_model API is supported with AUTO from version 2024.3 and above // Inputs with static dimensions @@ -96,7 +98,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_); exe_network_ = OVCore::Get()->CompileModel( - ov_model, hw_target, device_config, subgraph_context_.subgraph_name); + ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name); } LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { @@ -120,7 
+122,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr }; } inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req, std::move(initializer))); - bindings_ = std::make_unique(exe_network_, subgraph_context_); + bindings_ = std::make_unique(exe_network_, subgraph_context_, session_context_); } bool BasicBackend::ValidateSubgraph(std::map>& const_outputs_map) { @@ -181,6 +183,15 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { if (!session_context_.load_config.empty()) { const std::map& target_config = session_context_.load_config; + if ((session_context_.device_type.find("NPU") != std::string::npos) && session_context_.enable_causallm) { + if (target_config.find("NPU") != target_config.end()) { + auto npu_genai_config = target_config.at("NPU"); + CausalLMConfig().ApplyConfig(npu_genai_config, device_config); + } else { + LOGS_DEFAULT(WARNING) << "ORT GenAI CausalLMConfig Configuration not found."; + } + } + if (session_context_.device_type.find("NPU") != std::string::npos) { auto npuw_config = target_config.at("NPU"); @@ -246,7 +257,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options, const std::vector& supported_properties) { for (const auto& [key, value] : config_options) { - if (key.find("NPUW") != std::string::npos) { + if ((key.find("NPUW") != std::string::npos) || + ((device_config.find(key) != device_config.end()) && session_context_.enable_causallm)) { continue; } if (is_supported_and_mutable(key, supported_properties)) { @@ -339,6 +351,13 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) { device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads)); } +void BasicBackend::RewindKVCache(size_t index) { + OVInferRequestPtr infer_request; + infer_request = inferRequestsQueue_->getIdleRequest(); + infer_request->RewindKVCache(index); + inferRequestsQueue_->putIdleRequest(std::move(infer_request)); +} + // Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on // an Infer Request indexed by infer_req_idx void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) { @@ -351,7 +370,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && !session_context_.disable_dynamic_shapes && - cpu_or_gpu) { + cpu_or_gpu || (npu && session_context_.enable_causallm)) { auto tensor = context.GetInput(input_info.onnx_index); auto tensor_info = tensor.GetTensorTypeAndShapeInfo(); auto tensor_shape = tensor_info.GetShape(); @@ -409,7 +428,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } } // Loop subgraph original input - if (npu) { + // For Stateful Compilation i.e. enable_causallm as True, we use the dynamic shapes path for NPU plugin as well. 
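// Note (not part of the patch): with enable_causallm, outputs are gathered in
// CompleteAsyncInference the same way as on CPU/GPU, so the NPU remote-blob
// binding below is intentionally skipped for stateful compilation.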
+ if (npu && !session_context_.enable_causallm) { // Set the output blob as remote blob for (const auto& output_info : bindings_->network_outputs_) { Ort::UnownedValue tensor = context.GetOutput(output_info.onnx_index, output_info.onnx_shape); @@ -453,19 +473,20 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos || session_context_.device_type.find("GPU") != std::string::npos; - if (cpu_or_gpu) { + bool npu = session_context_.device_type.find("NPU") != std::string::npos; + if (cpu_or_gpu || (npu && session_context_.enable_causallm)) { for (const auto& output_info : bindings_->network_outputs_) { - OVTensorPtr graph_output_blob; - try { - graph_output_blob = infer_request->GetTensor(output_info.name); - } catch (const char* msg) { - ORT_THROW(msg); - } - size_t batch_size = 1; - Ort::UnownedValue output_tensor = - GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names); - auto mem_info = output_tensor.GetTensorMemoryInfo(); - if (mem_info.GetAllocatorName() == OpenVINO_GPU) { + OVTensorPtr graph_output_blob; + try { + graph_output_blob = infer_request->GetTensor(output_info.name); + } catch (const char* msg) { + ORT_THROW(msg); + } + size_t batch_size = 1; + Ort::UnownedValue output_tensor = + GetOutputTensor(context, batch_size, infer_request, output_info.name, subgraph_context_.output_names); + auto mem_info = output_tensor.GetTensorMemoryInfo(); + if (mem_info.GetAllocatorName() == OpenVINO_GPU) { return; } else { size_t batch_slice = 0; @@ -538,11 +559,19 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { try { StartAsyncInference(context, infer_request); } catch (const std::runtime_error& e) { + // If the inference fails (exception from ov::InferRequest::infer()), + // we need to put the infer_request back into the pool to avoid deadlocks + // and to allow the next inference request to proceed. + inferRequestsQueue_->putIdleRequest(std::move(infer_request)); ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what()); } try { CompleteAsyncInference(context, infer_request); } catch (const std::runtime_error& e) { + // If the inference fails (exception from ov::InferRequest::infer()), + // we need to put the infer_request back into the pool to avoid deadlocks + // and to allow the next inference request to proceed. 
+ inferRequestsQueue_->putIdleRequest(std::move(infer_request)); ORT_THROW(log_tag + " Exception at CompleteAsyncInference: " + e.what()); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 697c088a80620..fe178ccb5661b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -42,12 +42,22 @@ struct OnnxToOvNetworkBindings { std::vector network_outputs_; std::vector network_inputs_; - OnnxToOvNetworkBindings(OVExeNetwork& exec_network, SubGraphContext& subgraph_context) { + OnnxToOvNetworkBindings(OVExeNetwork& exec_network, SubGraphContext& subgraph_context, SessionContext& session_context) { auto populate = [&](auto& input_output_map, const SubGraphContext::string_index_map_t& onnx_input_map, const auto& ov_parameters) { for (const auto& [onnx_name, onnx_param_index] : onnx_input_map) { auto it = std::find_if(ov_parameters.begin(), ov_parameters.end(), [&onnx_name](const auto& ov_parameter_info) { return ov_parameter_info.get_names().contains(onnx_name); }); + // For Stateful Model Compilation, the ONNX model includes KV cache (past/present) tensors. + // However, these tensors are internally converted to a stateful representation, which removes them. + // To prevent runtime exceptions, we simply continue processing here. + if ((onnx_name.empty() || onnx_name == "beam_idx" || + onnx_name.find("past_key_values") != std::string::npos || + onnx_name.find("present") != std::string::npos) && + session_context.enable_causallm) { + continue; + } + ORT_ENFORCE(it != ov_parameters.end(), backend_utils::log_tag, "Input names mismatch between OpenVINO and ONNX. ", onnx_name, " doesn't exist in the list of OpenVINO input tensor names"); @@ -85,6 +95,7 @@ class BasicBackend : public IBackend { ov::CompiledModel GetOVCompiledModel() override { return exe_network_.Get(); } + void RewindKVCache(size_t index) override; private: bool ValidateSubgraph(std::map>& const_outputs_map); @@ -114,7 +125,7 @@ class InferRequestsQueue { OVInferRequestPtr infer_request; live_threads=nireq; for (size_t id = 0; id < nireq; id++) { - infer_request = std::make_shared(net.CreateInferRequest()); + infer_request = net.CreateInferRequest(); initializer(infer_request); infer_requests_.push_back(infer_request); } @@ -144,7 +155,6 @@ class InferRequestsQueue { OVInferRequestPtr getIdleRequest() { std::unique_lock lock(_mutex); - std::cout << "get Idle Request" << live_threads << "\n"; if(live_threads==0) { return nullptr; } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 7560f4570bd32..2506d587dd3ad 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -97,6 +97,7 @@ struct ProviderInfo { bool disable_dynamic_shapes{false}; // [disable_dynamic_shapes]: Rewrite dynamic shaped models to // static shape at runtime and execute. 
bool enable_qdq_optimizer{false}; // Enables QDQ pruning for efficient inference latency with NPU + bool enable_causallm{false}; // Enables Causal LM Compilation for ORT GenAI OVEP Pass bool so_context_enable{false}; // ORT session option bool so_disable_cpu_ep_fallback{false}; // ORT session option bool so_context_embed_mode{false}; // ORT session option diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 4532349897d17..752668b3c6fbe 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -17,6 +17,7 @@ class IBackend { virtual void Infer(OrtKernelContext* context) = 0; virtual ov::CompiledModel GetOVCompiledModel() = 0; virtual ~IBackend() = default; + virtual void RewindKVCache(size_t index) {} }; using ptr_stream_t = std::unique_ptr; class BackendFactory { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 3793317749a04..d12f1edc57da5 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -254,6 +254,25 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span= 0) { + backend.RewindKVCache(static_cast(index)); + } else { + LOGS_DEFAULT(WARNING) << "kvcache_rewind index is < 0:\t" << index; + } + } } else { // Handle unknown options LOGS_DEFAULT(WARNING) << "Unknown key/value pair - ignoring " << key << "/" << value; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index e5526ecd52bb9..f7e64a9be2c60 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -343,13 +343,20 @@ static void ParseProviderInfo(const ProviderOptions& provider_options, pi.enable_qdq_optimizer = ParseBooleanOption(provider_options, "enable_qdq_optimizer"); + pi.enable_causallm = ParseBooleanOption(provider_options, "enable_causallm"); + pi.disable_dynamic_shapes = ParseBooleanOption(provider_options, "disable_dynamic_shapes"); } catch (std::string msg) { ORT_THROW(msg); } // Always true for NPU plugin or when passed . if (pi.device_type.find("NPU") != std::string::npos) { - pi.disable_dynamic_shapes = true; + // For Stateful Compilation i.e. enable_causallm as True, we use the dynamic shapes path. 
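// Net effect for NPU (illustrative summary of the branch below): enable_causallm == true
// leaves dynamic shapes enabled, overriding any user-supplied disable_dynamic_shapes,
// while enable_causallm == false keeps the historical behaviour of forcing static shapes.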
+ if (pi.enable_causallm) { + pi.disable_dynamic_shapes = false; + } else { + pi.disable_dynamic_shapes = true; + } } } diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 0024a5e121bbf..0818f350562e9 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -7,6 +7,8 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/backend_utils.h" +#include "core/providers/openvino/backends/basic_backend.h" +#include "core/providers/openvino/ov_stateful_patch_utils.h" using Exception = ov::Exception; @@ -82,17 +84,85 @@ std::shared_ptr OVCore::ReadModel(std::string&& model, const std::str } } +OVExeNetwork OVCore::StatefulCompileModel(std::shared_ptr& model, + std::string& hw_target, + const ov::AnyMap& device_config) { + ov::CompiledModel compiled_model; + ov::AnyMap config = device_config; + + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateless OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } + + LOGS_DEFAULT(INFO) << log_tag << "Converting from Stateless OV Model to Stateful OV Model" << std::endl; + bool model_status = IsStateful(model); + LOGS_DEFAULT(INFO) << log_tag << "Model IsStateful() Status:\t" << (model_status ? "True" : "False"); + if (!model_status) { + PatchStatefulDecoder(model); + } + + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "Stateful OV Model Statistic:" << std::endl; + LogBasicModelInfo(model); + } + + auto kv_pos = GetKVAxesPos(model); + + if (hw_target.find("NPU") != std::string::npos) { + KVDesc kv_desc; + auto parse_genai_config = [&](const std::string& key, unsigned int default_value) { + return (config.count(key) && !config.at(key).empty() && config.at(key).as() != "0") ? config.at(key).as() : default_value; + }; + + kv_desc.max_prompt_len = parse_genai_config("MAX_PROMPT_LEN", CausalLMConfig().max_prompt_len); + kv_desc.min_response_len = parse_genai_config("MIN_RESPONSE_LEN", CausalLMConfig().min_response_len); + + // For compilation, MAX_PROMPT_LEN & MIN_RESPONSE_LEN should not be 0 + if (kv_desc.max_prompt_len == 0 || kv_desc.min_response_len == 0) { + ORT_THROW(log_tag + "MAX_PROMPT_LEN and MIN_RESPONSE_LEN cannot be 0 or empty"); + } + + if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled()) { + std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl; + std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl; + std::cout << "kv_desc.max_prompt_len:\t" << kv_desc.max_prompt_len << std::endl; + std::cout << "kv_desc.min_response_len:\t" << kv_desc.min_response_len << std::endl; + } + + UpdateNPUConfig(config, kv_pos, kv_desc); + } else { + // This patches the OV IR model so that it only produces the logits required for sampling. + // Actually either way that happens within NPUW::LLMCompiledModel creation for NPU device, + // while this is here mostly to align this behavior for other devices viz. (CPU, GPU). 
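// Illustrative effect of the transformation invoked below: the hidden-state input of the
// final logits MatMul, e.g. [batch, seq_len, hidden], is sliced to its last position, so the
// compiled model produces logits of shape [batch, 1, vocab] rather than [batch, seq_len, vocab].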
+ ApplySliceBeforeMatmulTransformation(model); + } + + LOGS_DEFAULT(INFO) << log_tag << "Compiling OV Model using Stateful Transformation flow"; + compiled_model = OVCore::Get()->core.compile_model(model, hw_target, config); + OVExeNetwork exe(compiled_model, hw_target, true); + return exe; +} + OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_network, std::string& hw_target, ov::AnyMap& device_config, + bool enable_causallm, const std::string& name) { - ov::CompiledModel obj; + OVExeNetwork exe; try { - obj = core.compile_model(ie_cnn_network, hw_target, device_config); + if (enable_causallm) { + auto mutable_model = ie_cnn_network->clone(); + exe = OVCore::Get()->StatefulCompileModel(mutable_model, hw_target, device_config); + } else { + auto obj = core.compile_model(ie_cnn_network, hw_target, device_config); + exe = OVExeNetwork(obj, hw_target); + } + #ifndef NDEBUG - printDebugInfo(obj); + printDebugInfo(exe.Get()); #endif - OVExeNetwork exe(obj); + return exe; } catch (const Exception& e) { ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); @@ -111,7 +181,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, #ifndef NDEBUG printDebugInfo(obj); #endif - OVExeNetwork exe(obj); + OVExeNetwork exe(obj, hw_target); return exe; } catch (const Exception& e) { ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); @@ -128,9 +198,9 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, ov::CompiledModel obj; obj = core.import_model(model_stream, hw_target, device_config); #ifndef NDEBUG - printDebugInfo(obj); + printDebugInfo(exe.Get()); #endif - OVExeNetwork exe(obj); + OVExeNetwork exe(obj, hw_target); return exe; } catch (const Exception& e) { ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); @@ -192,11 +262,16 @@ void OVCore::SetStreams(const std::string& device_type, int num_streams) { core.set_property(device_type, {ov::num_streams(num_streams)}); } -OVInferRequest OVExeNetwork::CreateInferRequest() { +std::shared_ptr OVExeNetwork::CreateInferRequest() { try { - auto infReq = obj.create_infer_request(); - OVInferRequest inf_obj(std::move(infReq)); - return inf_obj; + auto infReq = compiled_model_obj.create_infer_request(); + std::shared_ptr ovInfReq; + if (is_stateful_causallm) { + ovInfReq = std::make_shared(std::move(infReq), target_device); + } else { + ovInfReq = std::make_shared(std::move(infReq)); + } + return ovInfReq; } catch (const Exception& e) { ORT_THROW(log_tag + "Exception while creating InferRequest object: " + e.what()); } catch (...) { @@ -245,9 +320,9 @@ void OVInferRequest::StartAsync() { try { ovInfReq.start_async(); } catch (const Exception& e) { - ORT_THROW(log_tag + " Couldn't start Inference: " + e.what()); + throw std::runtime_error(log_tag + " Couldn't start Inference: " + e.what()); } catch (...) { - ORT_THROW(log_tag + " In Error Couldn't start Inference"); + throw std::runtime_error(log_tag + " In Error Couldn't start Inference"); } } @@ -255,9 +330,9 @@ void OVInferRequest::Infer() { try { ovInfReq.infer(); } catch (const Exception& e) { - ORT_THROW(log_tag + " Couldn't start Inference: " + e.what()); + throw std::runtime_error(log_tag + " Couldn't start Inference: " + e.what()); } catch (...) 
{ - ORT_THROW(log_tag + " In Error Couldn't start Inference"); + throw std::runtime_error(log_tag + " In Error Couldn't start Inference"); } } @@ -279,5 +354,160 @@ void OVInferRequest::QueryStatus() { std::cout << "ovInfReq.query_state()" << " "; } + +StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device) + : OVInferRequest(std::move(infer_request)), target_device(device) { + bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos)); + if (gpu_or_npu) { + prefill_use_full_chat_history = true; + } +} + +void StatefulOVInferRequest::FillTensor(const std::string& tensor_name, const ov::element::Type& type, + const std::vector& shape, int32_t fill_value) { + ov::Tensor tensor = ov::Tensor(type, shape); + std::fill_n(tensor.data(), tensor.get_size(), fill_value); + ovInfReq.set_tensor(tensor_name, tensor); +} + +void StatefulOVInferRequest::CacheTensor(const std::string& tensor_name, std::vector& cache) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto* pData = tensor.data(); + for (size_t i = 0; i < tensor.get_size(); i++) { + cache.emplace_back(pData[i]); + } +} + +void StatefulOVInferRequest::SetTensorFromCache(const std::string& tensor_name, + const std::vector& cache_data) { + auto tensor = ovInfReq.get_tensor(tensor_name); + auto new_shape = tensor.get_shape(); + new_shape[1] = cache_data.size(); + + auto new_tensor = ov::Tensor(tensor.get_element_type(), new_shape); + auto* pNewData = new_tensor.data(); + std::memcpy(pNewData, cache_data.data(), cache_data.size() * sizeof(int64_t)); + + ovInfReq.set_tensor(tensor_name, new_tensor); +} + +std::optional StatefulOVInferRequest::FindTensor(const std::string& tensor_name) { + // Check if tensor exists by examining input names in the compiled model + const auto& model = ovInfReq.get_compiled_model(); + bool tensor_exists = false; + + for (const auto& input : model.inputs()) { + const auto& names = input.get_names(); + if (names.find(tensor_name) != names.end()) { + tensor_exists = true; + break; + } + } + + if (tensor_exists) { + return ovInfReq.get_tensor(tensor_name); + } + + return std::nullopt; +} + +void StatefulOVInferRequest::PreProcessInferRequest() { + // Workaround: Setting the value here as it cannot be set at the ORT GenAI layer currently. + // TODO(ankit): Address this issue and implement the fix at the appropriate layer. + FillTensor("beam_idx", ov::element::i32, {1}, 0); + + // If 'prefill use full chat history' mode is enabled, we need to cache input_ids and position_ids. + if (prefill_use_full_chat_history) { + auto input_ids_tensor = ovInfReq.get_tensor("input_ids"); + CacheTensor("input_ids", cached_input_ids); + + // "position_ids" (GQA with Rotary Embeddings doesnt have position_ids) - check if exists + auto position_ids_opt = FindTensor("position_ids"); + bool has_position_ids = position_ids_opt.has_value(); + + if (has_position_ids) { + CacheTensor("position_ids", cached_position_ids); + } + + // If we're about to run the prefill model + if (input_ids_tensor.get_size() > 1) { + // Check if the size of the current "input_ids" tensor does not match the size of the cached "input_ids". + // This indicates that we are running a subsequent prompt (not the initial prefill). + if (input_ids_tensor.get_shape()[1] != cached_input_ids.size()) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. 
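// Illustrative sequence (token counts are made up): a first prefill of 5 tokens plus
// 3 decode steps leaves 8 entries in cached_input_ids; a second prefill of 4 tokens
// appends them (12 entries), and since 4 != 12 the state is reset here and the full
// 12-token history is replayed.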
+ ovInfReq.reset_state(); + + // Set tensors using cached values + SetTensorFromCache("input_ids", cached_input_ids); + + // Only set position_ids if it exists and we have cached values + if (has_position_ids && !cached_position_ids.empty()) { + SetTensorFromCache("position_ids", cached_position_ids); + } + } + } + } +} + +void StatefulOVInferRequest::StartAsync() { + PreProcessInferRequest(); + OVInferRequest::StartAsync(); +} + +void StatefulOVInferRequest::Infer() { + PreProcessInferRequest(); + OVInferRequest::Infer(); +} + +void StatefulOVInferRequest::RewindKVCache(size_t index) { + LOGS_DEFAULT(INFO) << log_tag << "RewindKVCache: Rewinding OpenVINO-internal KVCache state to index=" << index; + + if (prefill_use_full_chat_history) { + // Clear the internal KVCache state. For NPU device, this operation is a no-op. + ovInfReq.reset_state(); + + // Resize the cached "input_ids" and "position_ids" to the specified index. + if (cached_input_ids.size() > index) { + cached_input_ids.resize(index); + } + + if (cached_position_ids.size() > index) { + cached_position_ids.resize(index); + } + } else { + if (index == 0) { + // In this case, since we're resetting the entire KVCache, simply reset the state. + ovInfReq.reset_state(); + } else { + // Retrieve KVCache states and trim them to the specified index. + // The following logic is adapted from: + // https://github.com/openvinotoolkit/openvino.genai/blob/releases/2025/1/src/cpp/src/utils.cpp#L329 + auto states = ovInfReq.query_state(); + for (auto& state : states) { + ov::Tensor old_tensor = state.get_state(); + // Tensor shape: [batch_size, num_kv_heads, seq_len, head_size] + auto shape = old_tensor.get_shape(); + + if (shape[2] > index) { + // Update the sequence length dimension to the specified index. + shape[2] = index; + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + // Create a trimmed tensor with the updated shape. + auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end); + + // Copy the trimmed tensor into a new tensor and update the state. 
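// For example (shapes illustrative): a state tensor of shape [1, 8, 37, 64] rewound to
// index 32 is copied out below as a [1, 8, 32, 64] tensor before being set back on the state.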
+ ov::Tensor new_tensor(old_tensor.get_element_type(), shape); + trimmed_tensor.copy_to(new_tensor); + + state.set_state(new_tensor); + } + } + } + } +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 866f4a02f7780..c3d165b40840c 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -61,10 +61,14 @@ struct OVCore : WeakSingleton { // OV Interface For Reading Model std::shared_ptr ReadModel(std::string&& model_stream, const std::string& model_path); + OVExeNetwork StatefulCompileModel(std::shared_ptr& model, + std::string& hw_target, + const ov::AnyMap& device_config); // OV Interface for Compiling OV Model Type OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, std::string& hw_target, ov::AnyMap& device_config, + bool enable_causallm, const std::string& name); // OV Interface for Fast Compile OVExeNetwork CompileModel(const std::string& onnx_model, @@ -83,16 +87,20 @@ struct OVCore : WeakSingleton { }; class OVExeNetwork { - ov::CompiledModel obj; + ov::CompiledModel compiled_model_obj; + std::string target_device; + bool is_stateful_causallm; public: - explicit OVExeNetwork(ov::CompiledModel md) : obj(md) {} - OVExeNetwork() : obj(ov::CompiledModel()) {} - ov::CompiledModel& Get() { return obj; } - OVInferRequest CreateInferRequest(); + explicit OVExeNetwork(ov::CompiledModel compiled_model, std::string device, bool stateful_causallm = false) + : compiled_model_obj(compiled_model), target_device(device), is_stateful_causallm(stateful_causallm) {} + OVExeNetwork() : compiled_model_obj(ov::CompiledModel()) {} + ov::CompiledModel& Get() { return compiled_model_obj; } + std::shared_ptr CreateInferRequest(); }; class OVInferRequest { + protected: ov::InferRequest ovInfReq; public: @@ -100,16 +108,42 @@ class OVInferRequest { OVTensorPtr GetTensor(const std::string& name); std::string GetInputTensorName(uint32_t index); void SetTensor(const std::string& name, OVTensorPtr& blob); - void StartAsync(); - void Infer(); + virtual void StartAsync(); + virtual void Infer(); void WaitRequest(); void CancelRequest(); void QueryStatus(); - explicit OVInferRequest(ov::InferRequest obj) : ovInfReq(std::move(obj)) {} + explicit OVInferRequest(ov::InferRequest infer_request_obj) : ovInfReq(std::move(infer_request_obj)) {} OVInferRequest() : ovInfReq(ov::InferRequest()) {} ov::InferRequest& GetNewObj() { return ovInfReq; } + virtual void RewindKVCache(size_t index) {} +}; + +class StatefulOVInferRequest : public OVInferRequest { + public: + explicit StatefulOVInferRequest(ov::InferRequest infer_request, std::string device); + + void StartAsync() override; + void Infer() override; + void RewindKVCache(size_t index) override; + void FillTensor(const std::string& tensor_name, const ov::element::Type& type, + const std::vector& shape, int32_t fill_value); + void CacheTensor(const std::string& tensor_name, std::vector& cache); + void SetTensorFromCache(const std::string& tensor_name, const std::vector& cache_data); + std::optional FindTensor(const std::string& tensor_name); + + private: + void PreProcessInferRequest(); + std::string target_device; + + // If prefill_use_full_chat_history is true, cache the "input_ids" & "position_ids" tensors, + // and ensure that full chat history is passed for each prefill call. 
+ bool prefill_use_full_chat_history = false; + std::vector cached_input_ids; + std::vector cached_position_ids; }; + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc new file mode 100644 index 0000000000000..67ba42884e4f0 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc @@ -0,0 +1,350 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#include "core/providers/openvino/ov_stateful_patch_utils.h" + +namespace onnxruntime { +namespace openvino_ep { + +void LogBasicModelInfo(const std::shared_ptr& model) { + std::cout << "Model Name: " << model->get_friendly_name() << std::endl; + + // Log detailed information about model inputs and outputs + auto inputs = model->inputs(); + auto outputs = model->outputs(); + + std::cout << "\tInputs: " << std::endl; + for (const ov::Output& input : inputs) { + const std::string name = input.get_any_name(); + const ov::element::Type type = input.get_element_type(); + const ov::PartialShape shape = input.get_partial_shape(); + const ov::Layout layout = ov::layout::get_layout(input); + + std::cout << "\t\t" << name << ", " << type << ", " << shape << ", " << layout.to_string() << std::endl; + } + + std::cout << "\tOutputs: " << std::endl; + for (const ov::Output& output : outputs) { + const std::string name = output.get_any_name(); + const ov::element::Type type = output.get_element_type(); + const ov::PartialShape shape = output.get_partial_shape(); + const ov::Layout layout = ov::layout::get_layout(output); + + std::cout << "\t\t" << name << ", " << type << ", " << shape << ", " << layout.to_string() << std::endl; + } + + return; +} + +bool ModelHasInputOutputNames(std::shared_ptr model, const std::string& name_to_match) { + for (const ov::Output& input : model->inputs()) { + auto& names = input.get_names(); + + for (auto& name : names) { + if (name == name_to_match) { + return true; + } + } + } + + for (const ov::Output& output : model->outputs()) { + auto& names = output.get_names(); + for (auto& name : names) { + if (name == name_to_match) { + return true; + } + } + } + + return false; +} + +void FuseCacheReorder(std::shared_ptr ov_model, + std::vector& not_kv_inputs, + const std::vector& key_value_input_names, + int gather_dim) { + if (ModelHasInputOutputNames(ov_model, "beam_idx")) { + throw std::runtime_error("Model already has fused cache"); + } + + std::string main_input_name = "inputs_embeds"; + if (ModelHasInputOutputNames(ov_model, "input_ids")) { + main_input_name = "input_ids"; + } + + auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0]; + + auto beam_idx = std::make_shared(ov::element::i32, ov::PartialShape({input_batch})); + beam_idx->set_friendly_name("beam_idx"); + beam_idx->output(0).get_tensor().add_names({"beam_idx"}); + ov_model->add_parameters({beam_idx}); + not_kv_inputs.push_back(beam_idx->get_friendly_name()); + + // Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx + for (const auto& input_name : key_value_input_names) { + auto parameter_output_port = ov_model->input(input_name); + auto consumers = parameter_output_port.get_target_inputs(); + + auto gather_op = + std::make_shared(parameter_output_port, + beam_idx, + ov::opset13::Constant::create(ov::element::i64, {}, {gather_dim})); + + // Replace the source output for all consumers of the input tensor + 
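// End state after the loop below, per KV-cache input (illustrative):
//   past_key_values.N -> Gather(indices = beam_idx, axis = gather_dim) -> original consumers
// so beam reordering happens inside the graph through the new beam_idx parameter.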
for (auto& consumer : consumers) { + consumer.replace_source_output(gather_op->output(0)); + } + } + + // Validate the modified model + ov_model->validate_nodes_and_infer_types(); +} + +void MakeStateful(std::shared_ptr& ov_model, + const std::vector& key_value_input_names, + const std::vector& key_value_output_names) { + std::map input_output_map; + + // Create mapping for KV-cache inputs and outputs + for (size_t i = 0; i < key_value_input_names.size(); ++i) { + input_output_map[key_value_input_names[i]] = key_value_output_names[i]; + } + + // Apply the transformation to make the model stateful + ov::pass::Manager manager; + manager.register_pass(input_output_map); + manager.run_passes(ov_model); +} + +// Converted to C++ from below reference URL: +// https://github.com/huggingface/optimum-intel/blob/main/optimum/exporters/openvino/stateful.py#L281 +void PatchStatefulDecoder(std::shared_ptr model) { + std::vector key_value_input_names; + std::vector not_kv_inputs; + for (const ov::Output& input : model->inputs()) { + auto& names = input.get_names(); + + bool found = false; + for (auto& name : names) { + if (name.find("key_values") != std::string::npos) { + key_value_input_names.push_back(name); + found = true; + break; + } + } + + if (!found) { + not_kv_inputs.push_back(input.get_any_name()); + } + } + + std::vector key_value_output_names; + for (const ov::Output& output : model->outputs()) { + auto& names = output.get_names(); + for (auto& name : names) { + if (name.find("present") != std::string::npos) { + key_value_output_names.push_back(name); + break; + } + } + } + + if (key_value_input_names.empty() || key_value_output_names.empty()) { + std::cout << "no key_value_input_names or key_value_output_names found" << std::endl; + return; + } + + // By default, batch is the 0 - th but chatglm uses 1 - st dimension as batch + // TODO(ryan): Deduce from a model via ordinal reshape(? ) and topology + // batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0 + auto batch_dim = 0; + + FuseCacheReorder(model, not_kv_inputs, key_value_input_names, batch_dim); + + MakeStateful(model, key_value_input_names, key_value_output_names); +} + +// Some other utility functions copied from OpenVINO GenAI +bool HasOpWithType(const std::shared_ptr& function, const std::string& type_name) { + for (const auto& op : function->get_ops()) { + if (op->get_type_name() == type_name) { + return true; + } + } + return false; +} + +std::tuple, int64_t> FindLLMMatmul(const std::shared_ptr& model) { + auto last_node = model->output(0).get_node()->input_value(0).get_node_shared_ptr(); + std::shared_ptr matmul = ov::as_type_ptr(last_node); + + // In the case of PagedAttention, all tokens are moved to the batch dimension, + // and slicing/gathering must be performed accordingly. + const bool pa_based_model = HasOpWithType(model, "PagedAttentionExtension"); + int64_t slice_gather_dim = pa_based_model ? 
0 : 1; + + // There are several patterns for MatMul we are looking for: + // MatMul -> Result + // MatMul -> Add -> Result + // MatMul -> Transpose -> Result + // MatMul -> Divide -> Tanh -> Multiply -> Result + // MatMul -> Convert -> Result + if (!matmul) { + if (auto add = ov::as_type_ptr(last_node)) { + matmul = ov::as_type_ptr(add->input_value(0).get_node_shared_ptr()); + } else if (auto transpose = ov::as_type_ptr(last_node)) { + matmul = ov::as_type_ptr(transpose->input_value(0).get_node_shared_ptr()); + auto order = ov::as_type_ptr(transpose->input_value(1).get_node_shared_ptr())->get_axis_vector_val(); + slice_gather_dim = order[slice_gather_dim]; + } else if (auto multiply = ov::as_type_ptr(last_node)) { + if (auto tanh = ov::as_type_ptr(multiply->input_value(0).get_node_shared_ptr())) { + if (auto divide = ov::as_type_ptr(tanh->input_value(0).get_node_shared_ptr())) { + matmul = ov::as_type_ptr(divide->input_value(0).get_node_shared_ptr()); + } + } + } else if (auto convert = ov::as_type_ptr(last_node)) { + matmul = ov::as_type_ptr(convert->input_value(0).get_node_shared_ptr()); + } + } + return std::make_tuple(matmul, slice_gather_dim); +} + +void ApplySliceBeforeMatmulTransformation(std::shared_ptr model) { + std::shared_ptr matmul = nullptr; + int64_t slice_gather_dim = -1; + std::tie(matmul, slice_gather_dim) = FindLLMMatmul(model); + + if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) { + auto start = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{-1}); + auto stop = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{-2}); + auto step = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{-1}); + auto axis = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{slice_gather_dim}); + auto slice = std::make_shared(matmul->input_value(0), start, stop, step, axis); + matmul->input(0).replace_source_output(slice); + } +} + +void UpdateConfig(ov::AnyMap& config, const std::pair& pair) { + if (config.count(pair.first) == 0) { + config.insert(pair); + } +} + +std::optional PopOption(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + std::optional found = std::make_optional(it->second); + config.erase(it); + return found; + } + return std::nullopt; +} + +void RenameKey(ov::AnyMap& config, const std::string& old_key, const std::string& new_key) { + if (config.count(old_key) != 0) { + auto opt_value = PopOption(config, old_key); + config[new_key] = opt_value.value(); + } +} + +KVAxesPosition GetKVAxesPos(std::shared_ptr model) { + // Sequence length axis in key/values tensors. For most cases, the tensor shape is + // [batch_size, num_kv_heads, seq_len, head_size]. Therefore, the sequence length axis + // is usually at index 2, and the batch axis is at index 0. + KVAxesPosition kv_pos{0u, 2u}; + + // "ReadValue" node is KV cache representation in stateful model + std::string kv_node_type_name = std::string(ov::op::v6::ReadValue::get_type_info_static().name); + + for (const auto& op : model->get_ops()) { + // Check input size, as in LoRA adapters case it could be 0 + if (op->get_type_name() != kv_node_type_name || op->get_input_size() < 1) { + continue; + } + + // Shape example: [-1,4,0,64] + auto shape = op->get_input_partial_shape(0); + + for (int64_t i = 0; i < shape.rank().get_length(); i++) { + // Find axis = 0. This would be sequence length axis. 
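// Worked example: for the shape [-1, 4, 0, 64] above, index 0 is dynamic (batch) and
// index 2 is the zero-sized sequence axis, so kv_pos ends up as {batch = 0, seq_len = 2}.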
+ if (shape[i] == 0) { + kv_pos.seq_len = i; + } else if (shape[i].is_dynamic()) { + // Dynamic axis is a batch + kv_pos.batch = i; + } + } + break; + } + + return kv_pos; +} + +void UpdateNPUConfig(ov::AnyMap& config, const KVAxesPosition& kv_pos, const KVDesc& kv_desc) { + UpdateConfig(config, {"NPU_USE_NPUW", "YES"}); + UpdateConfig(config, {"NPUW_LLM", "YES"}); + + UpdateConfig(config, {"NPUW_LLM_BATCH_DIM", kv_pos.batch}); + UpdateConfig(config, {"NPUW_LLM_SEQ_LEN_DIM", kv_pos.seq_len}); + + UpdateConfig(config, {"NPUW_LLM_MAX_PROMPT_LEN", kv_desc.max_prompt_len}); + UpdateConfig(config, {"NPUW_LLM_MIN_RESPONSE_LEN", kv_desc.min_response_len}); + + RenameKey(config, "++PREFILL_CONFIG", "++NPUW_LLM_PREFILL_CONFIG"); + RenameKey(config, "++GENERATE_CONFIG", "++NPUW_LLM_GENERATE_CONFIG"); + RenameKey(config, "PREFILL_CONFIG", "NPUW_LLM_PREFILL_CONFIG"); + RenameKey(config, "PREFILL_HINT", "NPUW_LLM_PREFILL_HINT"); + RenameKey(config, "GENERATE_CONFIG", "NPUW_LLM_GENERATE_CONFIG"); + RenameKey(config, "GENERATE_HINT", "NPUW_LLM_GENERATE_HINT"); + + const size_t npuw_context_len_threshold = 2048; + if ((kv_desc.max_prompt_len + kv_desc.min_response_len) >= npuw_context_len_threshold) { + // This improves accuracy for generation sequences that exceed 2k tokens. + config["++NPUW_LLM_PREFILL_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "NPU,CPU"}, {"NPUW_ONLINE_AVOID", "P:SinCos/NPU"}}; + config["++NPUW_LLM_GENERATE_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "NPU,CPU"}, {"NPUW_ONLINE_AVOID", "P:SinCos/NPU"}}; + } +} + +std::optional PopOptionNew(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + std::optional found = std::make_optional(it->second); + config.erase(it); + return found; + } + return std::nullopt; +} + +std::optional PopIntAndCast(ov::AnyMap& config, const std::string& key) { + auto anyopt = PopOptionNew(config, key); + if (anyopt.has_value()) { + const auto any = anyopt.value(); + int64_t value; + // NB: Integer value coming from python has int64_t datatype + if (any.is()) { + value = any.as(); + } else if (any.is()) { + value = any.as(); + } else { + OPENVINO_THROW("Failed to extract " + key + ". 
Type mismatch: expected types: int or int64_t"); + } + if (value < 0) { + OPENVINO_THROW(key + " cannot be negative!"); + } + return std::make_optional(static_cast(value)); + } + return std::nullopt; +} + +bool IsStateful(const std::shared_ptr& model) { + for (auto&& ptr : model->get_ordered_ops()) { + if (ov::is_type(ptr) || + ov::is_type(ptr) || + ov::is_type(ptr) || + ov::is_type(ptr)) { + return true; + } + } + return false; +} + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h new file mode 100644 index 0000000000000..0b89c4ed02e13 --- /dev/null +++ b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.h @@ -0,0 +1,84 @@ +// Copyright (C) Intel Corporation +// Licensed under the MIT License + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "openvino/pass/manager.hpp" +#include "openvino/pass/make_stateful.hpp" +#include "openvino/opsets/opset13.hpp" + +namespace onnxruntime { +namespace openvino_ep { + +void LogBasicModelInfo(const std::shared_ptr& model); + +bool ModelHasInputOutputNames(std::shared_ptr model, const std::string& name_to_match); + +void FuseCacheReorder(std::shared_ptr ov_model, + std::vector& not_kv_inputs, + const std::vector& key_value_input_names, + int gather_dim); + +void MakeStateful(std::shared_ptr& ov_model, + const std::vector& key_value_input_names, + const std::vector& key_value_output_names); + +void PatchStatefulDecoder(std::shared_ptr model); + +bool HasOpWithType(const std::shared_ptr& function, const std::string& type_name); + +std::tuple, int64_t> FindLLMMatmul(const std::shared_ptr& model); + +void ApplySliceBeforeMatmulTransformation(std::shared_ptr model); + +void UpdateConfig(ov::AnyMap& config, const std::pair& pair); + +std::optional PopOption(ov::AnyMap& config, const std::string& option_name); + +void RenameKey(ov::AnyMap& config, const std::string& old_key, const std::string& new_key); + +struct KVAxesPosition { + size_t batch; + size_t seq_len; +}; + +KVAxesPosition GetKVAxesPos(std::shared_ptr model); + +struct KVDesc { + uint32_t max_prompt_len; + uint32_t min_response_len; +}; + +struct CausalLMConfig { + void ApplyConfig(const ov::AnyMap& external_config, ov::AnyMap& genai_config) { + if (external_config.find("MAX_PROMPT_LEN") != external_config.end()) { + max_prompt_len = external_config.at("MAX_PROMPT_LEN").as(); + } + if (external_config.find("MIN_RESPONSE_LEN") != external_config.end()) { + min_response_len = external_config.at("MIN_RESPONSE_LEN").as(); + } + genai_config["MAX_PROMPT_LEN"] = ov::Any(max_prompt_len); + genai_config["MIN_RESPONSE_LEN"] = ov::Any(min_response_len); + } + + unsigned int max_prompt_len = 1024; + unsigned int min_response_len = 128; +}; + +void UpdateNPUConfig(ov::AnyMap& config, const KVAxesPosition& kv_pos, const KVDesc& kv_desc); + +std::optional PopOptionNew(ov::AnyMap& config, const std::string& option_name); +std::optional PopIntAndCast(ov::AnyMap& config, const std::string& key); + +bool IsStateful(const std::shared_ptr& model); + +} // namespace openvino_ep +} // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/attention_op_test.cc b/onnxruntime/test/contrib_ops/attention_op_test.cc index 61e5fa05c66c1..4dff0376fcd84 100644 --- a/onnxruntime/test/contrib_ops/attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/attention_op_test.cc @@ -2047,7 +2047,7 @@ TEST(AttentionTest, 
AttentionPastState_dynamic) { test.AddInput("past", past_dims, past_data); test.AddReferenceOutputs("testdata/attention_past_state.onnx", 0.005f); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); } #endif //! defined(__wasm__) diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 05136ec0750a1..e8eda5af1dc29 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -760,6 +760,15 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); } else { ORT_THROW("[ERROR] [OpenVINO] The value for the key 'enable_qdq_optimizer' should be a boolean i.e. true or false. Default value is false.\n"); } + } else if (key == "enable_causallm") { + if (value == "true" || value == "True" || + value == "false" || value == "False") { + ov_options[key] = value; + } else { + ORT_THROW( + "[ERROR] [OpenVINO] The value for the key 'enable_causallm' should be a boolean i.e. true or false." + " Default value is false. This provider option must be used with CausalLM Models viz. LLMs & SLMs only.\n"); + } } else if (key == "disable_dynamic_shapes") { if (value == "true" || value == "True" || value == "false" || value == "False") { @@ -817,7 +826,8 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); ORT_THROW( "[ERROR] [OpenVINO] wrong key type entered. Choose from the following runtime key options that are available for OpenVINO." " ['device_type', 'device_id', 'num_of_threads', 'load_config', 'cache_dir', 'num_streams', " - "'enable_opencl_throttling', 'disable_dynamic_shapes', 'enable_qdq_optimizer', 'model_priority'] \n"); + "'enable_opencl_throttling', 'disable_dynamic_shapes', 'enable_qdq_optimizer'," + " 'enable_causallm', 'model_priority'] \n"); } } session_options.AppendExecutionProvider_OpenVINO_V2(ov_options);
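For reference, a minimal application-side sketch of how the new pieces are expected to fit together: enable_causallm goes in through the OpenVINO provider options (with a load_config JSON assumed to carry the NPU MAX_PROMPT_LEN / MIN_RESPONSE_LEN settings), and the kvcache_rewind dynamic option handled in openvino_execution_provider.cc is delivered through Session::SetEpDynamicOptions. This is not part of the patch; paths, option values and the rewind index are placeholders, and the snippet assumes the C++ wrapper for SetEpDynamicOptions is available.

#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ovep-causallm");
  Ort::SessionOptions so;
  so.AppendExecutionProvider_OpenVINO_V2({{"device_type", "NPU"},
                                          {"enable_causallm", "true"},
                                          {"load_config", "npu_genai_config.json"}});
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);

  // ... run prefill / decode iterations via session.Run(...) ...

  // Roll the OpenVINO-managed KV cache back to 32 tokens, e.g. after the
  // application discards part of the generated sequence.
  const char* keys[] = {"kvcache_rewind"};
  const char* values[] = {"32"};
  session.SetEpDynamicOptions(keys, values, 1);
  return 0;
}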