From 766fb9623ccb9530b70c6ddbb94b6e5d4cb6d832 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 20 Nov 2025 09:58:44 +0000 Subject: [PATCH 1/4] Initial plan From 5b0ff74d78b5ac4249b7edc55875376d5f3b79b3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:05:55 +0000 Subject: [PATCH 2/4] Add metric for tracking erroneous vCPU kicks Co-authored-by: simongdavies <1397489+simongdavies@users.noreply.github.com> --- src/hyperlight_host/src/hypervisor/hyperv_linux.rs | 8 +++++++- src/hyperlight_host/src/hypervisor/hyperv_windows.rs | 4 ++++ src/hyperlight_host/src/hypervisor/kvm.rs | 8 +++++++- src/hyperlight_host/src/metrics/mod.rs | 3 +++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs index 64074b5f9..56511ff9b 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs @@ -815,11 +815,17 @@ impl Hypervisor for HypervLinuxDriver { // and resume execution HyperlightExit::Debug(VcpuStopReason::Interrupt) } else { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); HyperlightExit::Retry() } #[cfg(not(gdb))] - HyperlightExit::Retry() + { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); + HyperlightExit::Retry() + } } } libc::EAGAIN => HyperlightExit::Retry(), diff --git a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs index ca1f75997..4f1504c7b 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs @@ -678,6 +678,8 @@ impl Hypervisor for HypervWindowsDriver { // needs to change the state of a VM or to inject an event into the processor // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks debug!("Internal cancellation detected, returning Retry error"); + // Track erroneous vCPU kick - internal cancellation not requested by user + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); HyperlightExit::Retry() } else { HyperlightExit::Cancelled() @@ -692,6 +694,8 @@ impl Hypervisor for HypervWindowsDriver { // needs to change the state of a VM or to inject an event into the processor // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks debug!("Internal cancellation detected, returning Retry error"); + // Track erroneous vCPU kick - internal cancellation not requested by user + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); HyperlightExit::Retry() } else { HyperlightExit::Cancelled() diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index 04b8ed60f..1ef78f2b3 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -760,11 +760,17 @@ impl Hypervisor for KVMDriver { // and resume execution HyperlightExit::Debug(VcpuStopReason::Interrupt) } else { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); HyperlightExit::Retry() } #[cfg(not(gdb))] - HyperlightExit::Retry() + { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); + HyperlightExit::Retry() + } } } libc::EAGAIN => HyperlightExit::Retry(), diff --git a/src/hyperlight_host/src/metrics/mod.rs b/src/hyperlight_host/src/metrics/mod.rs index 3a630fa44..4e16d7c89 100644 --- a/src/hyperlight_host/src/metrics/mod.rs +++ b/src/hyperlight_host/src/metrics/mod.rs @@ -21,6 +21,9 @@ pub(crate) static METRIC_GUEST_ERROR_LABEL_CODE: &str = "code"; // Counter metric that counts the number of times a guest function was called due to timing out pub(crate) static METRIC_GUEST_CANCELLATION: &str = "guest_cancellations_total"; +// Counter metric that counts the number of times a vcpu was kicked by a stale signal +pub(crate) static METRIC_ERRONEOUS_VCPU_KICK: &str = "erroneous_vcpu_kicks_total"; + // Histogram metric that measures the duration of guest function calls #[cfg(feature = "function_call_metrics")] pub(crate) static METRIC_GUEST_FUNC_DURATION: &str = "guest_call_duration_seconds"; From 32b77f9fa45a0ae83bdaf2a93699335bd7366994 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:13:29 +0000 Subject: [PATCH 3/4] Add test for erroneous vCPU kick metric Co-authored-by: simongdavies <1397489+simongdavies@users.noreply.github.com> --- .../src/hypervisor/hyperv_linux.rs | 6 ++-- src/hyperlight_host/src/hypervisor/kvm.rs | 6 ++-- src/hyperlight_host/src/metrics/mod.rs | 29 +++++++++++++++++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs index 56511ff9b..471a9a852 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs @@ -816,14 +816,16 @@ impl Hypervisor for HypervLinuxDriver { HyperlightExit::Debug(VcpuStopReason::Interrupt) } else { // Track erroneous vCPU kick - stale signal from previous call - metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); HyperlightExit::Retry() } #[cfg(not(gdb))] { // Track erroneous vCPU kick - stale signal from previous call - metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); HyperlightExit::Retry() } } diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index 1ef78f2b3..5a7ddb522 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -761,14 +761,16 @@ impl Hypervisor for KVMDriver { HyperlightExit::Debug(VcpuStopReason::Interrupt) } else { // Track erroneous vCPU kick - stale signal from previous call - metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); HyperlightExit::Retry() } #[cfg(not(gdb))] { // Track erroneous vCPU kick - stale signal from previous call - metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); HyperlightExit::Retry() } } diff --git a/src/hyperlight_host/src/metrics/mod.rs b/src/hyperlight_host/src/metrics/mod.rs index 4e16d7c89..033936e8f 100644 --- a/src/hyperlight_host/src/metrics/mod.rs +++ b/src/hyperlight_host/src/metrics/mod.rs @@ -199,4 +199,33 @@ mod tests { } } } + + #[test] + fn test_erroneous_vcpu_kick_metric_exists() { + // This test verifies that the metric can be incremented without errors. + // The actual scenario where this metric is incremented (stale signal delivery) + // is a race condition that's difficult to reliably reproduce in a test. + let recorder = metrics_util::debugging::DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + let snapshot = with_local_recorder(&recorder, || { + // Manually increment the metric to verify it works + metrics::counter!(METRIC_ERRONEOUS_VCPU_KICK).increment(1); + snapshotter.snapshot() + }); + + // Verify the metric was recorded + #[expect(clippy::mutable_key_type)] + let snapshot = snapshot.into_hashmap(); + + let counter_key = CompositeKey::new( + metrics_util::MetricKind::Counter, + Key::from_name(METRIC_ERRONEOUS_VCPU_KICK), + ); + assert_eq!( + snapshot.get(&counter_key).unwrap().2, + metrics_util::debugging::DebugValue::Counter(1), + "Erroneous vCPU kick metric should be recorded" + ); + } } From 6a7fd7548e4ca63d0ee248385ffe391a4072135c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:17:30 +0000 Subject: [PATCH 4/4] Refactor to reduce code duplication in metric tracking Co-authored-by: simongdavies <1397489+simongdavies@users.noreply.github.com> --- .../src/hypervisor/hyperv_linux.rs | 30 +++++++------ .../src/hypervisor/hyperv_windows.rs | 44 ++++++++++--------- src/hyperlight_host/src/hypervisor/kvm.rs | 30 +++++++------ 3 files changed, 55 insertions(+), 49 deletions(-) diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs index 471a9a852..0b001f453 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs @@ -805,20 +805,22 @@ impl Hypervisor for HypervLinuxDriver { HyperlightExit::Cancelled() } else { #[cfg(gdb)] - if debug_interrupt { - self.interrupt_handle - .debug_interrupt - .store(false, Ordering::Relaxed); - - // If the vCPU was stopped because of an interrupt, we need to - // return a special exit reason so that the gdb thread can handle it - // and resume execution - HyperlightExit::Debug(VcpuStopReason::Interrupt) - } else { - // Track erroneous vCPU kick - stale signal from previous call - metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) - .increment(1); - HyperlightExit::Retry() + { + if debug_interrupt { + self.interrupt_handle + .debug_interrupt + .store(false, Ordering::Relaxed); + + // If the vCPU was stopped because of an interrupt, we need to + // return a special exit reason so that the gdb thread can handle it + // and resume execution + HyperlightExit::Debug(VcpuStopReason::Interrupt) + } else { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); + HyperlightExit::Retry() + } } #[cfg(not(gdb))] diff --git a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs index 4f1504c7b..1baa1e98a 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs @@ -662,27 +662,29 @@ impl Hypervisor for HypervWindowsDriver { WHV_RUN_VP_EXIT_REASON(8193i32) => { debug!("HyperV Cancelled Details :\n {:#?}", &self); #[cfg(gdb)] - if debug_interrupt { - self.interrupt_handle - .debug_interrupt - .store(false, Ordering::Relaxed); - - // If the vCPU was stopped because of an interrupt, we need to - // return a special exit reason so that the gdb thread can handle it - // and resume execution - HyperlightExit::Debug(VcpuStopReason::Interrupt) - } else if !cancel_was_requested_manually { - // This was an internal cancellation - // The virtualization stack can use this function to return the control - // of a virtual processor back to the virtualization stack in case it - // needs to change the state of a VM or to inject an event into the processor - // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks - debug!("Internal cancellation detected, returning Retry error"); - // Track erroneous vCPU kick - internal cancellation not requested by user - metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); - HyperlightExit::Retry() - } else { - HyperlightExit::Cancelled() + { + if debug_interrupt { + self.interrupt_handle + .debug_interrupt + .store(false, Ordering::Relaxed); + + // If the vCPU was stopped because of an interrupt, we need to + // return a special exit reason so that the gdb thread can handle it + // and resume execution + HyperlightExit::Debug(VcpuStopReason::Interrupt) + } else if !cancel_was_requested_manually { + // This was an internal cancellation + // The virtualization stack can use this function to return the control + // of a virtual processor back to the virtualization stack in case it + // needs to change the state of a VM or to inject an event into the processor + // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks + debug!("Internal cancellation detected, returning Retry error"); + // Track erroneous vCPU kick - internal cancellation not requested by user + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1); + HyperlightExit::Retry() + } else { + HyperlightExit::Cancelled() + } } #[cfg(not(gdb))] diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index 5a7ddb522..e62e624b5 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -750,20 +750,22 @@ impl Hypervisor for KVMDriver { HyperlightExit::Cancelled() } else { #[cfg(gdb)] - if debug_interrupt { - self.interrupt_handle - .debug_interrupt - .store(false, Ordering::Relaxed); - - // If the vCPU was stopped because of an interrupt, we need to - // return a special exit reason so that the gdb thread can handle it - // and resume execution - HyperlightExit::Debug(VcpuStopReason::Interrupt) - } else { - // Track erroneous vCPU kick - stale signal from previous call - metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) - .increment(1); - HyperlightExit::Retry() + { + if debug_interrupt { + self.interrupt_handle + .debug_interrupt + .store(false, Ordering::Relaxed); + + // If the vCPU was stopped because of an interrupt, we need to + // return a special exit reason so that the gdb thread can handle it + // and resume execution + HyperlightExit::Debug(VcpuStopReason::Interrupt) + } else { + // Track erroneous vCPU kick - stale signal from previous call + metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK) + .increment(1); + HyperlightExit::Retry() + } } #[cfg(not(gdb))]