Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 22 additions & 12 deletions src/hyperlight_host/src/hypervisor/hyperv_linux.rs
Original file line number Diff line number Diff line change
Expand Up @@ -805,21 +805,31 @@ impl Hypervisor for HypervLinuxDriver {
HyperlightExit::Cancelled()
} else {
#[cfg(gdb)]
if debug_interrupt {
self.interrupt_handle
.debug_interrupt
.store(false, Ordering::Relaxed);

// If the vCPU was stopped because of an interrupt, we need to
// return a special exit reason so that the gdb thread can handle it
// and resume execution
HyperlightExit::Debug(VcpuStopReason::Interrupt)
} else {
HyperlightExit::Retry()
{
if debug_interrupt {
self.interrupt_handle
.debug_interrupt
.store(false, Ordering::Relaxed);

// If the vCPU was stopped because of an interrupt, we need to
// return a special exit reason so that the gdb thread can handle it
// and resume execution
HyperlightExit::Debug(VcpuStopReason::Interrupt)
} else {
// Track erroneous vCPU kick - stale signal from previous call
metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK)
.increment(1);
HyperlightExit::Retry()
}
}

#[cfg(not(gdb))]
HyperlightExit::Retry()
{
// Track erroneous vCPU kick - stale signal from previous call
metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK)
.increment(1);
HyperlightExit::Retry()
}
}
}
libc::EAGAIN => HyperlightExit::Retry(),
Expand Down
44 changes: 25 additions & 19 deletions src/hyperlight_host/src/hypervisor/hyperv_windows.rs
Original file line number Diff line number Diff line change
Expand Up @@ -662,25 +662,29 @@ impl Hypervisor for HypervWindowsDriver {
WHV_RUN_VP_EXIT_REASON(8193i32) => {
debug!("HyperV Cancelled Details :\n {:#?}", &self);
#[cfg(gdb)]
if debug_interrupt {
self.interrupt_handle
.debug_interrupt
.store(false, Ordering::Relaxed);

// If the vCPU was stopped because of an interrupt, we need to
// return a special exit reason so that the gdb thread can handle it
// and resume execution
HyperlightExit::Debug(VcpuStopReason::Interrupt)
} else if !cancel_was_requested_manually {
// This was an internal cancellation
// The virtualization stack can use this function to return the control
// of a virtual processor back to the virtualization stack in case it
// needs to change the state of a VM or to inject an event into the processor
// see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks
debug!("Internal cancellation detected, returning Retry error");
HyperlightExit::Retry()
} else {
HyperlightExit::Cancelled()
{
if debug_interrupt {
self.interrupt_handle
.debug_interrupt
.store(false, Ordering::Relaxed);

// If the vCPU was stopped because of an interrupt, we need to
// return a special exit reason so that the gdb thread can handle it
// and resume execution
HyperlightExit::Debug(VcpuStopReason::Interrupt)
} else if !cancel_was_requested_manually {
// This was an internal cancellation
// The virtualization stack can use this function to return the control
// of a virtual processor back to the virtualization stack in case it
// needs to change the state of a VM or to inject an event into the processor
// see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks
debug!("Internal cancellation detected, returning Retry error");
// Track erroneous vCPU kick - internal cancellation not requested by user
metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1);
HyperlightExit::Retry()
} else {
HyperlightExit::Cancelled()
}
}

#[cfg(not(gdb))]
Expand All @@ -692,6 +696,8 @@ impl Hypervisor for HypervWindowsDriver {
// needs to change the state of a VM or to inject an event into the processor
// see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks
debug!("Internal cancellation detected, returning Retry error");
// Track erroneous vCPU kick - internal cancellation not requested by user
metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK).increment(1);
HyperlightExit::Retry()
} else {
HyperlightExit::Cancelled()
Expand Down
34 changes: 22 additions & 12 deletions src/hyperlight_host/src/hypervisor/kvm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -750,21 +750,31 @@ impl Hypervisor for KVMDriver {
HyperlightExit::Cancelled()
} else {
#[cfg(gdb)]
if debug_interrupt {
self.interrupt_handle
.debug_interrupt
.store(false, Ordering::Relaxed);

// If the vCPU was stopped because of an interrupt, we need to
// return a special exit reason so that the gdb thread can handle it
// and resume execution
HyperlightExit::Debug(VcpuStopReason::Interrupt)
} else {
HyperlightExit::Retry()
{
if debug_interrupt {
self.interrupt_handle
.debug_interrupt
.store(false, Ordering::Relaxed);

// If the vCPU was stopped because of an interrupt, we need to
// return a special exit reason so that the gdb thread can handle it
// and resume execution
HyperlightExit::Debug(VcpuStopReason::Interrupt)
} else {
// Track erroneous vCPU kick - stale signal from previous call
metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK)
.increment(1);
HyperlightExit::Retry()
}
}

#[cfg(not(gdb))]
HyperlightExit::Retry()
{
// Track erroneous vCPU kick - stale signal from previous call
metrics::counter!(crate::metrics::METRIC_ERRONEOUS_VCPU_KICK)
.increment(1);
HyperlightExit::Retry()
}
}
}
libc::EAGAIN => HyperlightExit::Retry(),
Expand Down
32 changes: 32 additions & 0 deletions src/hyperlight_host/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ pub(crate) static METRIC_GUEST_ERROR_LABEL_CODE: &str = "code";
// Counter metric that counts the number of times a guest function call was cancelled due to timing out
pub(crate) static METRIC_GUEST_CANCELLATION: &str = "guest_cancellations_total";

// Counter metric that counts the number of times a vCPU was kicked out of its run
// loop erroneously, i.e. the run is answered with `HyperlightExit::Retry()`:
// - KVMDriver / HypervLinuxDriver: a stale signal from a previous call
// - HypervWindowsDriver: an internal cancellation that was not requested by the user
pub(crate) static METRIC_ERRONEOUS_VCPU_KICK: &str = "erroneous_vcpu_kicks_total";

// Histogram metric that measures the duration of guest function calls
#[cfg(feature = "function_call_metrics")]
pub(crate) static METRIC_GUEST_FUNC_DURATION: &str = "guest_call_duration_seconds";
Expand Down Expand Up @@ -196,4 +199,33 @@ mod tests {
}
}
}

#[test]
fn test_erroneous_vcpu_kick_metric_exists() {
    // Smoke test: the erroneous-kick counter can be incremented and is visible
    // to a recorder. The real trigger (delivery of a stale signal) is a race
    // that cannot be reproduced reliably, so the counter is bumped by hand.
    let debug_recorder = metrics_util::debugging::DebuggingRecorder::new();
    let snapshotter = debug_recorder.snapshotter();

    // Increment once while the debugging recorder is the local recorder, then
    // capture everything it observed.
    let captured = with_local_recorder(&debug_recorder, || {
        let kick_counter = metrics::counter!(METRIC_ERRONEOUS_VCPU_KICK);
        kick_counter.increment(1);
        snapshotter.snapshot()
    });

    // Index the captured metrics by key so the counter can be looked up directly.
    #[expect(clippy::mutable_key_type)]
    let recorded = captured.into_hashmap();

    let expected_key = CompositeKey::new(
        metrics_util::MetricKind::Counter,
        Key::from_name(METRIC_ERRONEOUS_VCPU_KICK),
    );

    // The third tuple field of a recorded entry holds the metric's value.
    let entry = recorded.get(&expected_key).unwrap();
    assert_eq!(
        entry.2,
        metrics_util::debugging::DebugValue::Counter(1),
        "Erroneous vCPU kick metric should be recorded"
    );
}
}