diff --git a/.dco-signoff-marker b/.dco-signoff-marker new file mode 100644 index 000000000..e69de29bb diff --git a/.squashed-commit-marker b/.squashed-commit-marker new file mode 100644 index 000000000..2141ede1b --- /dev/null +++ b/.squashed-commit-marker @@ -0,0 +1 @@ +Squashed commit: a39633dd5d0cdd1727690af873752b2162021d67 diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index 7a4c31014..23624365e 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -23,7 +23,7 @@ use crate::hypervisor::regs::{ CommonFpu, CommonRegisters, CommonSegmentRegister, CommonSpecialRegisters, }; use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags}; -use crate::metrics::METRIC_GUEST_CANCELLATION; +use crate::metrics::{METRIC_ERRONEOUS_VCPU_KICKS, METRIC_GUEST_CANCELLATION}; #[cfg(feature = "mem_profile")] use crate::sandbox::trace::MemTraceInfo; use crate::{HyperlightError, Result, log_then_return}; @@ -469,8 +469,12 @@ impl VirtualCPU { } Ok(HyperlightExit::Cancelled()) => { // If cancellation was not requested for this specific guest function call, - // the vcpu was interrupted by a stale cancellation from a previous call + // the vcpu was interrupted by a stale cancellation. This can occur when: + // - Linux: A signal from a previous call arrives late + // - Windows: WHvCancelRunVirtualProcessor called right after vcpu exits but RUNNING_BIT is still true if !cancel_requested && !debug_interrupted { + // Track that an erroneous vCPU kick occurred + metrics::counter!(METRIC_ERRONEOUS_VCPU_KICKS).increment(1); // treat this the same as a HyperlightExit::Retry, the cancel was not meant for this call continue; } diff --git a/src/hyperlight_host/src/metrics/mod.rs b/src/hyperlight_host/src/metrics/mod.rs index 3a630fa44..a2228c8fc 100644 --- a/src/hyperlight_host/src/metrics/mod.rs +++ b/src/hyperlight_host/src/metrics/mod.rs @@ -21,6 +21,12 @@ pub(crate) static METRIC_GUEST_ERROR_LABEL_CODE: &str = "code"; // Counter metric that counts the number of times a guest function was called due to timing out pub(crate) static METRIC_GUEST_CANCELLATION: &str = "guest_cancellations_total"; +// Counter metric that counts the number of times a vCPU was erroneously kicked by a stale cancellation +// This can happen in two scenarios: +// 1. Linux: A signal from a previous guest call arrives late and interrupts a new call +// 2. Windows: WHvCancelRunVirtualProcessor is called right after vCPU exits but RUNNING_BIT is still true +pub(crate) static METRIC_ERRONEOUS_VCPU_KICKS: &str = "erroneous_vcpu_kicks_total"; + // Histogram metric that measures the duration of guest function calls #[cfg(feature = "function_call_metrics")] pub(crate) static METRIC_GUEST_FUNC_DURATION: &str = "guest_call_duration_seconds";