From 63b7b52b68d701690874d9be0e9437c659cd91d1 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 23 Apr 2026 10:33:38 +0100 Subject: [PATCH 01/10] common: add paravirtualized clock types and clock-page layout constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the types and layout constants shared between host and guest that describe the paravirtualized clock page: - hyperlight_common::time - KvmPvclockVcpuTimeInfo (KVM ABI, 32 bytes) - HvReferenceTscPage (Hyper-V TLFS §12.7, 4 KiB) - PVCLOCK_TSC_STABLE_BIT - ClockType { None, KvmPvclock, HyperVReferenceTsc } (unrecognised bytes deliberately decode to None, so a misbehaving host advertising garbage is treated as 'clock unavailable' rather than being mis-parsed) - Compile-time size invariants on the hypervisor-ABI structs. - hyperlight_common::layout - SCRATCH_TOP_CLOCK_PAGE_OFFSET (0x3000: one page below the guest-counter page, avoiding the i686 frame-number issue) - CLOCK_PAGE_SIZE (0x1000) - CLOCK_PAGE_CLOCK_TYPE_OFFSET (0x0FE0, u64) - CLOCK_PAGE_BOOT_TIME_NS_OFFSET (0x0FE8, u64) - CLOCK_PAGE_TRAILER_SIZE (16) - Compile-time asserts that the trailer does not collide with the 32-byte KVM calibration header and fits inside the page. The clock page will live in the sandbox's scratch region, which upstream snapshot machinery structurally excludes from snapshots — a snapshot taken at T0 cannot be used to observe T0's wall clock once the guest runs again. No feature gate on this crate: the types are always compiled in. Actually writing / reading the page and arming the hypervisor clock will land in subsequent commits under the host-only enable_guest_clock feature. 
Signed-off-by: Simon Davies --- src/hyperlight_common/src/layout.rs | 50 ++++++++- src/hyperlight_common/src/lib.rs | 4 + src/hyperlight_common/src/time.rs | 165 ++++++++++++++++++++++++++++ 3 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 src/hyperlight_common/src/time.rs diff --git a/src/hyperlight_common/src/layout.rs b/src/hyperlight_common/src/layout.rs index 1a7ca0880..c267f41ce 100644 --- a/src/hyperlight_common/src/layout.rs +++ b/src/hyperlight_common/src/layout.rs @@ -38,7 +38,31 @@ pub const SCRATCH_TOP_SIZE_OFFSET: u64 = 0x08; pub const SCRATCH_TOP_ALLOCATOR_OFFSET: u64 = 0x10; pub const SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET: u64 = 0x18; pub const SCRATCH_TOP_SNAPSHOT_GENERATION_OFFSET: u64 = 0x20; -pub const SCRATCH_TOP_EXN_STACK_OFFSET: u64 = 0x30; + +/// Offset from the top of scratch for the `clock_type` field (u64). +/// +/// Identifies which paravirtualized clock the host configured +/// ([`crate::time::ClockType`]). Lives in the bookkeeping page at the +/// top of scratch — NOT in the clock page itself — so the hypervisor +/// cannot clobber it if it extends the TLFS-reserved region. +pub const SCRATCH_TOP_CLOCK_TYPE_OFFSET: u64 = 0x28; + +/// Offset from the top of scratch for the `boot_time_ns` field (u64). +/// +/// The Unix-epoch origin of the monotonic clock, computed by the host +/// as `SystemTime::now() - current_monotonic_ns()` and written in +/// `arm_clock`. The guest recovers wall time as +/// `boot_time_ns + monotonic_time_ns()`. +/// +/// Hyper-V has no equivalent to KVM's `MSR_KVM_WALL_CLOCK_NEW`, so +/// we use this uniform host-computed approach on all backends. +pub const SCRATCH_TOP_BOOT_TIME_NS_OFFSET: u64 = 0x30; + +// ---- Next free offset in the bookkeeping page: 0x38 ---- +// When adding new host→guest shared fields, use the next multiple of +// 8 after the last offset above. 
All fields in this page are u64, +// little-endian, host-written and guest-read, and are excluded from +// snapshots because they live in scratch memory. /// Offset from the top of scratch memory for a shared host-guest u64 counter. /// @@ -49,6 +73,30 @@ pub const SCRATCH_TOP_EXN_STACK_OFFSET: u64 = 0x30; #[cfg(feature = "guest-counter")] pub const SCRATCH_TOP_GUEST_COUNTER_OFFSET: u64 = 0x1008; +/// Offset from the top of scratch memory for the start of the paravirtualized +/// clock page. +/// +/// The clock page is a single 4 KiB page occupying the scratch offsets +/// `[0x3000, 0x2000)` from the top — i.e. one page lower than the +/// guest-counter page, to avoid the i686 frame-number issue that forces the +/// counter off the very last page (see [`SCRATCH_TOP_GUEST_COUNTER_OFFSET`]). +/// +/// The constant is the distance from the exclusive top of scratch down to +/// the page *base* — in other words, the page base address is +/// `MAX_GPA + 1 - SCRATCH_TOP_CLOCK_PAGE_OFFSET` (likewise with `MAX_GVA`). +/// +/// The page is only present when the host is built with the +/// `enable_guest_clock` feature. +pub const SCRATCH_TOP_CLOCK_PAGE_OFFSET: u64 = 0x3000; + +/// Size of the paravirtualized clock page in bytes (one 4 KiB page). +/// The entire page is owned by the hypervisor (KVM pvclock or Hyper-V +/// Reference TSC). Hyperlight's own metadata (`clock_type`, +/// `boot_time_ns`) lives in the bookkeeping page at offsets +/// `SCRATCH_TOP_CLOCK_TYPE_OFFSET` / `SCRATCH_TOP_BOOT_TIME_NS_OFFSET`, +/// NOT in the clock page, so a future TLFS extension cannot clobber it. 
+pub const CLOCK_PAGE_SIZE: u64 = 0x1000; + pub fn scratch_base_gpa(size: usize) -> u64 { (MAX_GPA - size + 1) as u64 } diff --git a/src/hyperlight_common/src/lib.rs b/src/hyperlight_common/src/lib.rs index eb4be220c..0bbb70dba 100644 --- a/src/hyperlight_common/src/lib.rs +++ b/src/hyperlight_common/src/lib.rs @@ -48,5 +48,9 @@ pub mod func; // cbindgen:ignore pub mod vmem; +/// Paravirtualized clock structures shared between host and guest. +/// cbindgen:ignore +pub mod time; + /// ELF note types for embedding hyperlight version metadata in guest binaries. pub mod version_note; diff --git a/src/hyperlight_common/src/time.rs b/src/hyperlight_common/src/time.rs new file mode 100644 index 000000000..48cd0e660 --- /dev/null +++ b/src/hyperlight_common/src/time.rs @@ -0,0 +1,165 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Paravirtualized clock structures shared between host and guest. +//! +//! Guests can read time without a VM exit by consulting a shared memory page +//! that the hypervisor updates. The page is placed in the sandbox's scratch +//! region (see [`crate::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET`]), so it is +//! not included in sandbox snapshots. +//! +//! # Layout +//! +//! The clock page (4 KiB) is 100% hypervisor-owned: +//! +//! ```text +//! clock page (offset -0x3000 from scratch top): +//! 0x0000 .. : hypervisor calibration data +//! - KVM: KvmPvclockVcpuTimeInfo (32 bytes) +//! 
- Hyper-V: HvReferenceTscPage (4096 bytes) +//! ``` +//! +//! Hyperlight's own metadata lives in the bookkeeping page at the top +//! of scratch (separate from the clock page), so a future TLFS +//! extension of the reserved region cannot clobber it: +//! +//! ```text +//! bookkeeping page (top of scratch, offset -0x08..-0x30): +//! -0x28 : clock_type (u64, ClockType discriminant) +//! -0x30 : boot_time_ns (u64, Unix-epoch origin of monotonic clock) +//! ``` + +/// KVM pvclock flag: TSC is stable and synchronized across vCPUs. +/// +/// When this bit is set in [`KvmPvclockVcpuTimeInfo::flags`], the TSC is +/// guaranteed to be monotonic and synchronized across all vCPUs, even when +/// migrating between physical CPUs on the same host. +/// +/// Reference: Linux kernel `arch/x86/include/asm/pvclock-abi.h`. +pub const PVCLOCK_TSC_STABLE_BIT: u8 = 1 << 0; + +/// KVM pvclock structure (defined by KVM ABI). +/// +/// The host writes to this structure, and the guest reads it to compute the +/// current time in nanoseconds. +/// +/// Reference: Linux kernel `arch/x86/include/asm/pvclock-abi.h`. +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct KvmPvclockVcpuTimeInfo { + /// Version counter — odd means update in progress. Guest must re-read + /// if this changes during read. + pub version: u32, + pub pad0: u32, + /// TSC value when `system_time` was captured. + pub tsc_timestamp: u64, + /// System time in nanoseconds at `tsc_timestamp`. + pub system_time: u64, + /// Multiplier for TSC → nanoseconds conversion. + pub tsc_to_system_mul: u32, + /// Shift for TSC → nanoseconds conversion (can be negative). + pub tsc_shift: i8, + /// Flags (e.g. [`PVCLOCK_TSC_STABLE_BIT`]). + pub flags: u8, + pub pad: [u8; 2], +} + +/// Hyper-V Reference TSC page structure (defined by Hyper-V ABI). +/// +/// Used by both MSHV (Linux) and WHP (Windows). Time is in 100-nanosecond +/// intervals. +/// +/// Reference: Hyper-V TLFS §12.7. 
+#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct HvReferenceTscPage { + /// Sequence counter. A value of 0 means the host is directing the guest + /// to fall back to an MSR read; the guest must also re-read if this + /// changes during a read. + pub tsc_sequence: u32, + pub reserved1: u32, + /// Scale factor for TSC → time conversion. + /// Formula: `time = ((tsc * tsc_scale) >> 64) + tsc_offset` (in 100 ns). + pub tsc_scale: u64, + /// Offset to add after scaling (in 100 ns units). + pub tsc_offset: i64, + /// Rest of the 4 KiB page is reserved by the TLFS. + pub reserved2: [u64; 509], +} + +/// Type of paravirtualized clock configured for the guest. +/// +/// This is the value written by the host at +/// [`crate::layout::SCRATCH_TOP_CLOCK_TYPE_OFFSET`] in the scratch +/// bookkeeping page. +/// The guest treats any value other than the two supported variants as +/// [`ClockType::None`] — this means a misbehaving host that writes garbage +/// to the bookkeeping page simply ends up advertising "no clock", rather than +/// causing the guest to misinterpret the calibration header. +#[repr(u64)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ClockType { + /// No clock configured — time functions return `None` / zero. + None = 0, + /// KVM pvclock (Linux KVM hypervisor). + KvmPvclock = 1, + /// Hyper-V Reference TSC (MSHV on Linux, WHP on Windows). + HyperVReferenceTsc = 2, +} + +impl From<u64> for ClockType { + fn from(value: u64) -> Self { + match value { + 1 => ClockType::KvmPvclock, + 2 => ClockType::HyperVReferenceTsc, + _ => ClockType::None, + } + } +} + +impl From<ClockType> for u64 { + fn from(value: ClockType) -> Self { + value as u64 + } +} + +// Compile-time size invariants. These layouts are dictated by the hypervisor +// ABI (KVM pvclock, Hyper-V TLFS §12.7) — a size mismatch is a programming +// error that must surface at build time. 
+const _: () = { + assert!(core::mem::size_of::<KvmPvclockVcpuTimeInfo>() == 32); + assert!(core::mem::size_of::<HvReferenceTscPage>() == 4096); +}; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn clock_type_conversion_round_trips() { + assert_eq!(ClockType::from(0u64), ClockType::None); + assert_eq!(ClockType::from(1u64), ClockType::KvmPvclock); + assert_eq!(ClockType::from(2u64), ClockType::HyperVReferenceTsc); + } + + #[test] + fn clock_type_conversion_unknown_is_none() { + // A host that writes an unrecognised value must be treated as + // "clock unavailable", not as an opportunity to misinterpret. + assert_eq!(ClockType::from(3u64), ClockType::None); + assert_eq!(ClockType::from(u64::MAX), ClockType::None); + } +} From 65c1074322d6e583d9a891d9c468b23a46781068 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 23 Apr 2026 10:52:43 +0100 Subject: [PATCH 02/10] host: add enable_guest_clock feature + VirtualMachine::setup_pvclock Introduce a new opt-in host cargo feature 'enable_guest_clock' that will eventually let guests read wall-clock and monotonic time without taking VM exits via a paravirtualized clock page. This commit lays the hypervisor plumbing only: it adds the feature flag plus a required VirtualMachine::setup_pvclock(gpa) trait method, gated on both the feature and target_arch = x86_64 (pvclock is an x86 concept). Backend implementations: KVM x86_64 writes MSR_KVM_SYSTEM_TIME_NEW (0x4b564d01) with (gpa | 1); MSHV x86_64 sets HV_REGISTER_REFERENCE_TSC with (gpa | 1); WHP on Windows sets WHvRegisterReferenceTsc with (gpa | 1). The method is not yet invoked from the initialise path; that comes in a follow-on commit once the scratch-region clock page layout is wired up. The allow(dead_code) marker on the trait method will be removed then. No behaviour change when the feature is off. Clippy and fmt clean both with and without the feature. 
Signed-off-by: Simon Davies --- src/hyperlight_host/Cargo.toml | 2 + .../hypervisor/virtual_machine/kvm/x86_64.rs | 46 ++++++++++++++++++ .../src/hypervisor/virtual_machine/mod.rs | 27 +++++++++++ .../hypervisor/virtual_machine/mshv/x86_64.rs | 48 +++++++++++++++++++ .../src/hypervisor/virtual_machine/whp.rs | 47 ++++++++++++++++++ 5 files changed, 170 insertions(+) diff --git a/src/hyperlight_host/Cargo.toml b/src/hyperlight_host/Cargo.toml index abc24fab8..9c8dd41a1 100644 --- a/src/hyperlight_host/Cargo.toml +++ b/src/hyperlight_host/Cargo.toml @@ -140,6 +140,8 @@ build-metadata = ["dep:built"] i686-guest = ["hyperlight-common/i686-guest"] nanvix-unstable = ["i686-guest", "hyperlight-common/nanvix-unstable"] guest-counter = ["hyperlight-common/guest-counter"] +# Populate the paravirtualized clock page so guests can read time without VM exits. x86_64 only. +enable_guest_clock = [] [[bench]] name = "benchmarks" diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs index db68dfdd0..626ebbe7e 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs @@ -468,6 +468,52 @@ impl VirtualMachine for KvmVm { Ok(()) } + + #[cfg(feature = "enable_guest_clock")] + fn setup_pvclock(&mut self, clock_page_gpa: u64) -> crate::Result<()> { + // KVM pvclock: write `MSR_KVM_SYSTEM_TIME_NEW` with `gpa | 1`. + // Bit 0 is the "enable" flag; clearing it disables pvclock for this + // vCPU. 
+ // + // Reference: https://docs.kernel.org/virt/kvm/x86/msr.html#pvclock + use kvm_bindings::{Msrs, kvm_msr_entry}; + + const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b564d01; + const PVCLOCK_ENABLE_BIT: u64 = 1; + + let mut msrs = Msrs::new(1) + .map_err(|e| crate::new_error!("Failed to allocate MSR list for pvclock: {}", e))?; + msrs.as_mut_slice()[0] = kvm_msr_entry { + index: MSR_KVM_SYSTEM_TIME_NEW, + data: clock_page_gpa | PVCLOCK_ENABLE_BIT, + ..Default::default() + }; + + self.vcpu_fd + .set_msrs(&msrs) + .map_err(|e| crate::new_error!("Failed to set pvclock MSR: {}", e))?; + + tracing::debug!( + target: "hyperlight::pvclock", + clock_page_gpa, + "KVM pvclock armed" + ); + Ok(()) + } + + #[cfg(feature = "enable_guest_clock")] + fn current_monotonic_ns(&self) -> crate::Result<u64> { + // KVM_GET_CLOCK returns kvmclock nanoseconds — the same time base + // the guest reads through the pvclock page. We cannot use + // clock_gettime(CLOCK_MONOTONIC) here because kvmclock has its + // own epoch (which can be shifted via KVM_SET_CLOCK) and does not + // necessarily match host CLOCK_MONOTONIC. + let clock = self + .vm_fd + .get_clock() + .map_err(|e| crate::new_error!("KVM_GET_CLOCK failed: {}", e))?; + Ok(clock.clock) + } } #[cfg(gdb)] diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs index ecb19a09f..047f36d58 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs @@ -353,6 +353,33 @@ pub(crate) trait VirtualMachine: Debug + Send { #[cfg(not(feature = "i686-guest"))] fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError>; + /// Arm the hypervisor's paravirtualized clock for this vCPU, pointing it + /// at the guest physical address of the sandbox's clock page. + /// + /// Must be called before the first `run_vcpu`, and again on snapshot + /// restore since the register lives in vCPU state. 
+ #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + #[allow(dead_code)] // wired up in a follow-on commit + fn setup_pvclock( + &mut self, + clock_page_gpa: u64, + ) -> std::result::Result<(), crate::HyperlightError>; + + /// Read the host's monotonic clock for the time base that backs the + /// guest's paravirtualized clock page, in nanoseconds. + /// + /// Each hypervisor has its own monotonic epoch that may differ from + /// `CLOCK_MONOTONIC`, so we cannot use a single host clock. This + /// value is used to derive `boot_time_ns = wall_now - monotonic_now`, + /// giving guests a uniform wall-clock origin across all backends. + /// + /// KVM does offer `MSR_KVM_WALL_CLOCK_NEW` for this, but Hyper-V's + /// TLFS explicitly states its reference time is "not intended to be + /// used as a source of wall clock time". Rather than diverge per + /// backend, we use the same host-computed approach everywhere. + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + fn current_monotonic_ns(&self) -> std::result::Result<u64, crate::HyperlightError>; + + /// Get partition handle #[cfg(target_os = "windows")] fn partition_handle(&self) -> windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE; diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs index 27f024ca6..82b912612 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs @@ -469,6 +469,54 @@ impl VirtualMachine for MshvVm { .map_err(|e| RegisterError::SetXsave(e.into()))?; Ok(()) } + + #[cfg(feature = "enable_guest_clock")] + fn setup_pvclock(&mut self, clock_page_gpa: u64) -> crate::Result<()> { + // Hyper-V Reference TSC page: write `HV_REGISTER_REFERENCE_TSC` with + // `gpa | 1`. Bit 0 is the "enable" flag. + // + // Reference: Hyper-V TLFS section 12.7 (Reference TSC Page). 
+ use mshv_bindings::hv_register_name_HV_REGISTER_REFERENCE_TSC; + + const REFERENCE_TSC_ENABLE_BIT: u64 = 1; + + self.vcpu_fd + .set_reg(&[hv_register_assoc { + name: hv_register_name_HV_REGISTER_REFERENCE_TSC, + value: hv_register_value { + reg64: clock_page_gpa | REFERENCE_TSC_ENABLE_BIT, + }, + ..Default::default() + }]) + .map_err(|e| crate::new_error!("Failed to set HV_REGISTER_REFERENCE_TSC: {}", e))?; + + tracing::debug!( + target: "hyperlight::pvclock", + clock_page_gpa, + "MSHV Reference TSC armed" + ); + Ok(()) + } + + #[cfg(feature = "enable_guest_clock")] + fn current_monotonic_ns(&self) -> crate::Result<u64> { + // HV Reference TSC is partition reference time in 100 ns units; + // the host reads the same time base via HV_REGISTER_TIME_REF_COUNT. + use mshv_bindings::hv_register_name_HV_REGISTER_TIME_REF_COUNT; + let mut reg = [hv_register_assoc { + name: hv_register_name_HV_REGISTER_TIME_REF_COUNT, + value: hv_register_value { reg64: 0 }, + ..Default::default() + }]; + self.vcpu_fd + .get_reg(&mut reg) + .map_err(|e| crate::new_error!("Failed to read HV_REGISTER_TIME_REF_COUNT: {}", e))?; + // SAFETY: the union holds reg64 because we set the register name to + // a 64-bit register, and `get_reg` writes through the same union + // shape we provided. 
+ let ticks_100ns = unsafe { reg[0].value.reg64 }; + Ok(ticks_100ns.wrapping_mul(100)) + } } #[cfg(gdb)] diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs index 18e366835..d853cd846 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs @@ -803,6 +803,53 @@ impl VirtualMachine for WhpVm { fn partition_handle(&self) -> WHV_PARTITION_HANDLE { self.partition } + + #[cfg(feature = "enable_guest_clock")] + fn setup_pvclock(&mut self, clock_page_gpa: u64) -> crate::Result<()> { + // Hyper-V Reference TSC page via WHP: write `WHvRegisterReferenceTsc` + // with `gpa | 1`. Bit 0 is the "enable" flag. + // + // Reference: Hyper-V TLFS section 12.7 (Reference TSC Page). + const REFERENCE_TSC_ENABLE_BIT: u64 = 1; + + let reg_value = WHV_REGISTER_VALUE { + Reg64: clock_page_gpa | REFERENCE_TSC_ENABLE_BIT, + }; + self.set_registers(&[(WHvRegisterReferenceTsc, Align16(reg_value))]) + .map_err(|e| crate::new_error!("Failed to set WHvRegisterReferenceTsc: {}", e))?; + + tracing::debug!( + target: "hyperlight::pvclock", + clock_page_gpa, + "WHP Reference TSC armed" + ); + Ok(()) + } + + #[cfg(feature = "enable_guest_clock")] + fn current_monotonic_ns(&self) -> crate::Result<u64> { + // WHP exposes the partition reference time (same time base as the + // Reference TSC page) via a partition property, NOT a vCPU register. + // The value is in 100 ns units. + // + // Note: WHP does not expose `WHvRegisterTimeRefCount` — that is a + // Hyper-V/MSHV register name. The WHP equivalent is + // `WHvPartitionPropertyCodeReferenceTime`. 
+ let mut property: WHV_PARTITION_PROPERTY = unsafe { std::mem::zeroed() }; + let mut written_size = 0u32; + unsafe { + WHvGetPartitionProperty( + self.partition, + WHvPartitionPropertyCodeReferenceTime, + &mut property as *mut WHV_PARTITION_PROPERTY as *mut c_void, + std::mem::size_of::<WHV_PARTITION_PROPERTY>() as u32, + Some(&mut written_size), + ) + .map_err(|e| crate::new_error!("Failed to read WHP ReferenceTime: {}", e))?; + } + let ticks_100ns = unsafe { property.ReferenceTime }; + Ok(ticks_100ns.wrapping_mul(100)) + } } #[cfg(gdb)] From 49d073d2b9bd2280551f8170383a738872e53fb7 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 23 Apr 2026 11:06:29 +0100 Subject: [PATCH 03/10] host/common: reserve scratch clock page and expose GPA/GVA With enable_guest_clock on, the host bumps min_scratch_size by one page so the sandbox's scratch region has room for the paravirtualized clock page at its top. The extra page lives in scratch (not in the snapshot) so boot_time_ns and calibration data don't leak across snapshot/restore. Common gains clock_page_gpa() / clock_page_gva() const helpers (both are fixed relative to MAX_GPA/MAX_GVA, independent of scratch_size). Host gets SandboxMemoryLayout::get_clock_page_gpa() under the feature. The bump is applied at the host enforcement site (SandboxMemoryLayout::new and set_pt_size) rather than in common::layout::min_scratch_size, because the guest has no knowledge of whether the host built with the clock feature -- common must stay guest-visible. Still no behaviour change when the feature is off. The accessor and the helpers are #[allow(dead_code)] for now -- they're wired into initialise() in a follow-on commit. 
Signed-off-by: Simon Davies --- src/hyperlight_common/src/layout.rs | 21 +++++++++++++++++++++ src/hyperlight_host/src/mem/layout.rs | 8 ++++++++ 2 files changed, 29 insertions(+) diff --git a/src/hyperlight_common/src/layout.rs b/src/hyperlight_common/src/layout.rs index c267f41ce..00f560a7b 100644 --- a/src/hyperlight_common/src/layout.rs +++ b/src/hyperlight_common/src/layout.rs @@ -104,5 +104,26 @@ pub fn scratch_base_gva(size: usize) -> u64 { (MAX_GVA - size + 1) as u64 } +/// Guest physical address of the base of the paravirtualized clock page. +/// +/// The clock page sits at a fixed offset from the top of the guest physical +/// address space, independent of `scratch_size`: it is always +/// `MAX_GPA + 1 - SCRATCH_TOP_CLOCK_PAGE_OFFSET`. +/// +/// Only meaningful when the host is built with the `enable_guest_clock` +/// feature; otherwise the page is not populated. +pub const fn clock_page_gpa() -> u64 { + (MAX_GPA as u64) + 1 - SCRATCH_TOP_CLOCK_PAGE_OFFSET +} + +/// Guest virtual address of the base of the paravirtualized clock page. +/// +/// See [`clock_page_gpa`]. Scratch is mapped identity-style from +/// `scratch_base_gva` to `scratch_base_gpa`, so the clock page sits at the +/// equivalent offset in the guest virtual address space. +pub const fn clock_page_gva() -> u64 { + (MAX_GVA as u64) + 1 - SCRATCH_TOP_CLOCK_PAGE_OFFSET +} + /// Compute the minimum scratch region size needed for a sandbox. pub use arch::min_scratch_size; diff --git a/src/hyperlight_host/src/mem/layout.rs b/src/hyperlight_host/src/mem/layout.rs index 26615d579..7395f3908 100644 --- a/src/hyperlight_host/src/mem/layout.rs +++ b/src/hyperlight_host/src/mem/layout.rs @@ -341,6 +341,11 @@ impl SandboxMemoryLayout { cfg.get_input_data_size(), cfg.get_output_data_size(), ); + // The guest allocator unconditionally reserves the clock page at + // the top of scratch (so its footprint is feature-independent), + // so the host minimum must always account for it. 
+ let min_scratch_size = + min_scratch_size + hyperlight_common::layout::CLOCK_PAGE_SIZE as usize; if scratch_size < min_scratch_size { return Err(MemoryRequestTooSmall(scratch_size, min_scratch_size)); } @@ -595,6 +600,9 @@ impl SandboxMemoryLayout { self.sandbox_memory_config.get_input_data_size(), self.sandbox_memory_config.get_output_data_size(), ); + // Must match the unconditional clock page reservation in the guest allocator. + let min_fixed_scratch = + min_fixed_scratch + hyperlight_common::layout::CLOCK_PAGE_SIZE as usize; let min_scratch = min_fixed_scratch + size; if self.scratch_size < min_scratch { return Err(MemoryRequestTooSmall(self.scratch_size, min_scratch)); From 432b6b38252b612f8915820b630357f12d2bbc3e Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 23 Apr 2026 11:15:34 +0100 Subject: [PATCH 04/10] host: arm guest clock at initialise and on snapshot restore Wire the pvclock / HV Reference TSC plumbing into the sandbox lifecycle. VirtualMachine::setup_pvclock now returns the ClockType discriminant it armed, so the caller doesn't have to duplicate the KVM-vs-HV decision made inside the backend. HyperlightVm gains a feature-gated arm_clock(scratch) helper that (a) asks the backend VirtualMachine to point its clock register at clock_page_gpa(), (b) stamps the returned ClockType into the clock-page trailer, and (c) stamps SystemTime::now() as boot_time_ns into the trailer. It takes the host-side scratch_mem by reference (HostSharedMemory has the write API; the GuestSharedMemory stored on HyperlightVm does not). arm_clock is called from two places: * HyperlightVm::initialise on x86_64, just before the first run_vcpu, so the guest's first instruction already sees a live clock page. * InitializedMultiUseSandbox::restore after reset_vcpu, so the restored guest observes wall-clock reflecting the restore moment (not the original boot) and the per-vCPU clock MSR is re-issued on the fresh vCPU state. 
InitializeError::ArmClock is added (with boxed inner HyperlightError to avoid the HyperlightError -> HyperlightVmError -> InitializeError recursion cycle). No behaviour change when the feature is off; just clippy debug and just clippy release both clean. Signed-off-by: Simon Davies --- .../src/hypervisor/hyperlight_vm/mod.rs | 121 ++++++++++++++++++ .../src/hypervisor/hyperlight_vm/x86_64.rs | 19 +++ .../hypervisor/virtual_machine/kvm/x86_64.rs | 7 +- .../src/hypervisor/virtual_machine/mod.rs | 3 +- .../hypervisor/virtual_machine/mshv/x86_64.rs | 7 +- .../src/hypervisor/virtual_machine/whp.rs | 7 +- .../src/sandbox/initialized_multi_use.rs | 8 ++ src/hyperlight_host/src/sandbox/snapshot.rs | 7 +- 8 files changed, 170 insertions(+), 9 deletions(-) diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs index 830b856c0..eb0f9384f 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs @@ -183,6 +183,9 @@ pub enum InitializeError { SetupRegs(#[from] RegisterError), #[error("Guest initialised stack pointer to architecturally invalid value: {0}")] InvalidStackPointer(u64), + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + #[error("Failed to arm paravirtualized guest clock: {0}")] + ArmClock(#[source] Box<crate::HyperlightError>), } /// Errors that can occur during VM execution in the run loop @@ -492,6 +495,124 @@ impl HyperlightVm { Ok(()) } + /// Set up the pvclock / Reference TSC MSR and stamp `clock_type` + /// into the scratch bookkeeping page so the guest can read monotonic + /// time during `hyperlight_main` (init). + /// + /// Does NOT stamp `boot_time_ns` — on some backends (KVM) the + /// monotonic clock source is unreliable until after the first + /// vCPU run (see [`arm_clock`]). Wall-clock time returns `None` + /// until `arm_clock` is called. + /// + /// Must be called before the first vCPU run. 
+ #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + pub(crate) fn setup_clock( + &mut self, + scratch: &crate::mem::shared_mem::HostSharedMemory, + ) -> crate::Result<()> { + use hyperlight_common::layout::{SCRATCH_TOP_CLOCK_TYPE_OFFSET, clock_page_gpa}; + + use crate::mem::shared_mem::SharedMemory; + + let gpa = clock_page_gpa(); + let clock_type = self.vm.setup_pvclock(gpa)?; + + // Write clock_type to the bookkeeping page (top of scratch), + // NOT into the clock page itself — the clock page is 100% + // hypervisor-owned. + let scratch_size = scratch.mem_size(); + let clock_type_offset = scratch_size + .checked_sub(SCRATCH_TOP_CLOCK_TYPE_OFFSET as usize) + .ok_or_else(|| crate::new_error!("scratch region too small for clock metadata"))?; + + scratch.write::<u64>(clock_type_offset, u64::from(clock_type))?; + + tracing::debug!( + target: "hyperlight::pvclock", + ?clock_type, + "clock MSR configured, boot_time_ns deferred until after first vCPU run" + ); + Ok(()) + } + + /// Arm the paravirtualized clock: set up the MSR and stamp + /// `clock_type` + `boot_time_ns` into the scratch bookkeeping page. + /// + /// Computes `boot_time_ns = wall_now - monotonic_now` where + /// `monotonic_now` comes from `VirtualMachine::current_monotonic_ns()`. + /// The guest recovers wall time as + /// `boot_time_ns + monotonic_time_ns()`. + /// + /// # Call sites + /// + /// - **Initial sandbox creation**: called after `initialise()` + /// returns (i.e. after the first vCPU run). On some backends + /// (KVM) the monotonic clock source is unreliable until the + /// first vCPU entry. Monotonic time is available during + /// `hyperlight_main` via the pvclock page (set up by + /// [`setup_clock`] before the first vCPU run), but wall-clock + /// time returns `None` until this method stamps `boot_time_ns`. + /// + /// - **Snapshot restore**: called directly by the restore path. 
+ /// Re-stamps fresh `boot_time_ns` so the restored guest sees + /// wall time reflecting the restore moment. + /// + /// Must be called while `scratch_memory` is `Some`. + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + pub(crate) fn arm_clock( + &mut self, + scratch: &crate::mem::shared_mem::HostSharedMemory, + ) -> crate::Result<()> { + use std::time::{SystemTime, UNIX_EPOCH}; + + use hyperlight_common::layout::{ + SCRATCH_TOP_BOOT_TIME_NS_OFFSET, SCRATCH_TOP_CLOCK_TYPE_OFFSET, clock_page_gpa, + }; + + use crate::mem::shared_mem::SharedMemory; + + let gpa = clock_page_gpa(); + let clock_type = self.vm.setup_pvclock(gpa)?; + + let scratch_size = scratch.mem_size(); + + // Sample monotonic first, then wall clock. If preempted between + // the two reads, boot_time_ns shifts forward (guest wall clock + // runs slightly ahead of host) rather than backward — "slightly + // in the future" is more benign than "slightly in the past" for + // most use cases. The gap is bounded by the 20ms test tolerance. + let mono_ns = self.vm.current_monotonic_ns()?; + let wall_ns = u64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|e| crate::new_error!("system time before Unix epoch: {}", e))? + .as_nanos(), + ) + .map_err(|_| crate::new_error!("wall_ns overflowed u64"))?; + let boot_time_ns = wall_ns.wrapping_sub(mono_ns); + + // Write metadata to the bookkeeping page (top of scratch), + // NOT into the clock page — the clock page is 100% + // hypervisor-owned. 
+ let clock_type_offset = scratch_size + .checked_sub(SCRATCH_TOP_CLOCK_TYPE_OFFSET as usize) + .ok_or_else(|| crate::new_error!("scratch region too small for clock metadata"))?; + let boot_time_offset = scratch_size + .checked_sub(SCRATCH_TOP_BOOT_TIME_NS_OFFSET as usize) + .ok_or_else(|| crate::new_error!("scratch region too small for clock metadata"))?; + + scratch.write::<u64>(clock_type_offset, u64::from(clock_type))?; + scratch.write::<u64>(boot_time_offset, boot_time_ns)?; + + tracing::debug!( + target: "hyperlight::pvclock", + ?clock_type, + boot_time_ns, + "guest clock armed" + ); + Ok(()) + } + /// Get the current stack top virtual address pub(crate) fn get_stack_top(&mut self) -> u64 { self.rsp_gva diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs index f06c94964..af8f08617 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs @@ -212,6 +212,13 @@ impl HyperlightVm { return Ok(()); }; + // Set up the pvclock MSR so monotonic time works during init. + // boot_time_ns (wall clock) is deferred until after the first + // vCPU run — see arm_clock below. + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + self.setup_clock(&mem_mgr.scratch_mem) + .map_err(|e| InitializeError::ArmClock(Box::new(e)))?; + let regs = CommonRegisters { rip: initialise, // We usually keep the top of the stack 16-byte @@ -241,6 +248,18 @@ impl HyperlightVm { ) .map_err(InitializeError::Run)?; + // Arm the paravirtualized clock after the first vCPU run. + // On some backends the monotonic clock source is unreliable + // until after the first vCPU entry, so wall-clock calibration + // is deferred to here. Wall clock is not available to the + // guest during hyperlight_main (init), but monotonic time + // works fine since the pvclock page is populated before the + // first vCPU entry.
Wall clock becomes available on + // subsequent dispatch calls. + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + self.arm_clock(&mem_mgr.scratch_mem) + .map_err(|e| InitializeError::ArmClock(Box::new(e)))?; + let regs = self.vm.regs()?; // todo(portability): this is architecture-specific if !regs.rsp.is_multiple_of(16) { diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs index 626ebbe7e..7e78291ea 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs @@ -470,7 +470,10 @@ impl VirtualMachine for KvmVm { } #[cfg(feature = "enable_guest_clock")] - fn setup_pvclock(&mut self, clock_page_gpa: u64) -> crate::Result<()> { + fn setup_pvclock( + &mut self, + clock_page_gpa: u64, + ) -> crate::Result<hyperlight_common::time::ClockType> { // KVM pvclock: write `MSR_KVM_SYSTEM_TIME_NEW` with `gpa | 1`. // Bit 0 is the "enable" flag; clearing it disables pvclock for this // vCPU. @@ -498,7 +501,7 @@ impl VirtualMachine for KvmVm { clock_page_gpa, "KVM pvclock armed" ); - Ok(()) + Ok(hyperlight_common::time::ClockType::KvmPvclock) } #[cfg(feature = "enable_guest_clock")] diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs index 047f36d58..ce2518d07 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs @@ -359,11 +359,10 @@ pub(crate) trait VirtualMachine: Debug + Send { /// Must be called before the first `run_vcpu`, and again on snapshot /// restore since the register lives in vCPU state.
#[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] - #[allow(dead_code)] // wired up in a follow-on commit fn setup_pvclock( &mut self, clock_page_gpa: u64, - ) -> std::result::Result<(), crate::HyperlightError>; + ) -> std::result::Result<hyperlight_common::time::ClockType, crate::HyperlightError>; /// Read the host's monotonic clock for the time base that backs the /// guest's paravirtualized clock page, in nanoseconds. diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs index 82b912612..9fb2b3166 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs @@ -471,7 +471,10 @@ impl VirtualMachine for MshvVm { } #[cfg(feature = "enable_guest_clock")] - fn setup_pvclock(&mut self, clock_page_gpa: u64) -> crate::Result<()> { + fn setup_pvclock( + &mut self, + clock_page_gpa: u64, + ) -> crate::Result<hyperlight_common::time::ClockType> { // Hyper-V Reference TSC page: write `HV_REGISTER_REFERENCE_TSC` with // `gpa | 1`. Bit 0 is the "enable" flag. // @@ -495,7 +498,7 @@ impl VirtualMachine for MshvVm { clock_page_gpa, "MSHV Reference TSC armed" ); - Ok(()) + Ok(hyperlight_common::time::ClockType::HyperVReferenceTsc) } #[cfg(feature = "enable_guest_clock")] diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs index d853cd846..d2bbe73d0 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs @@ -805,7 +805,10 @@ impl VirtualMachine for WhpVm { } #[cfg(feature = "enable_guest_clock")] - fn setup_pvclock(&mut self, clock_page_gpa: u64) -> crate::Result<()> { + fn setup_pvclock( + &mut self, + clock_page_gpa: u64, + ) -> crate::Result<hyperlight_common::time::ClockType> { // Hyper-V Reference TSC page via WHP: write `WHvRegisterReferenceTsc` // with `gpa | 1`. Bit 0 is the "enable" flag.
// @@ -823,7 +826,7 @@ impl VirtualMachine for WhpVm { clock_page_gpa, "WHP Reference TSC armed" ); - Ok(()) + Ok(hyperlight_common::time::ClockType::HyperVReferenceTsc) } #[cfg(feature = "enable_guest_clock")] diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 241622cab..51f8dd8ec 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -348,6 +348,14 @@ impl MultiUseSandbox { HyperlightVmError::Restore(e) })?; + // Re-arm the paravirtualized clock for the freshly-reset vCPU. The + // MSR / HV register that pvclock rides on lives in vCPU state and + // has been clobbered by reset_vcpu above, and `boot_time_ns` must + // be re-stamped so the restored guest sees wall-clock reflecting + // the restore moment, not the original boot. + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + self.vm.arm_clock(&self.mem_mgr.scratch_mem)?; + self.vm.set_stack_top(snapshot.stack_top_gva()); self.vm.set_entrypoint(snapshot.entrypoint()); diff --git a/src/hyperlight_host/src/sandbox/snapshot.rs b/src/hyperlight_host/src/sandbox/snapshot.rs index e4c7b1133..3cdf65b4f 100644 --- a/src/hyperlight_host/src/sandbox/snapshot.rs +++ b/src/hyperlight_host/src/sandbox/snapshot.rs @@ -405,8 +405,13 @@ impl Snapshot { layout.set_pt_size(pt_bytes.len())?; memory.extend(&pt_bytes); + // The main/init stack top must live below the reserved clock page + // at the top of scratch; otherwise the guest's first stack writes + // clobber the paravirtualized clock page. The clock page is + // always reserved independent of the host's `enable_guest_clock` + // feature. 
+ let exn_stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1; let extra_regions = Vec::new(); From 420dca432c46a2e5a04053194e20afaa0ff94b54 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 23 Apr 2026 11:29:24 +0100 Subject: [PATCH 05/10] guest: add low-level paravirtualized clock reader Add hyperlight_guest::time, a no_std module providing free functions that read the paravirtualized clock page configured by the host: * is_available() / validate_clock() for runtime discovery and optional defense-in-depth checks * monotonic_time_ns() / monotonic_time_us() for monotonic time since sandbox boot * wall_clock_time_ns() / wall_clock_time() for UTC wall-clock time The clock page lives at a fixed guest-virtual address inside scratch (hyperlight_common::layout::clock_page_gva()), so the guest does not need a PEB field or any host-populated pointer to find it -- the GVA is a compile-time constant derived from MAX_GVA and SCRATCH_TOP_CLOCK_PAGE_OFFSET. Clock source is selected per-host: * KVM pvclock uses the seqlock-style version/payload protocol from https://docs.kernel.org/virt/kvm/x86/msr.html#pvclock * Hyper-V Reference TSC uses the seqlock-style protocol from TLFS section 12.7; tsc_sequence == 0 is the persistent MSR-fallback sentinel and is surfaced as None (MSR reads require a VM exit which a Hyperlight guest cannot make). Both readers cap retries at CLOCK_SEQLOCK_MAX_RETRIES (100) so a pathologically churning host cannot make us spin forever. Acquire fences pair with the host's release barriers so aarch64 emits the right dmb ishld; on x86_64 they are free. The scratch bookkeeping page (top of scratch, not the clock page itself) carries the ClockType discriminant (SCRATCH_TOP_CLOCK_TYPE_OFFSET) and boot_time_ns (SCRATCH_TOP_BOOT_TIME_NS_OFFSET).
The guest validates the byte on read: unknown discriminants decode to ClockType::None, which is reported as "not available" -- matching what happens when the host is built without enable_guest_clock. just check-i686 passes (32-bit build clean). Pre-existing clippy-guests breakage in the custom x86_64-hyperlight-none target is unrelated (picolibc wint_t header issue on base commit). Signed-off-by: Simon Davies --- src/hyperlight_guest/src/lib.rs | 1 + src/hyperlight_guest/src/time.rs | 361 +++++++++++++++++++++++++++++++ 2 files changed, 362 insertions(+) create mode 100644 src/hyperlight_guest/src/time.rs diff --git a/src/hyperlight_guest/src/lib.rs b/src/hyperlight_guest/src/lib.rs index 19e5ac5f2..a3811fa05 100644 --- a/src/hyperlight_guest/src/lib.rs +++ b/src/hyperlight_guest/src/lib.rs @@ -25,6 +25,7 @@ pub mod error; pub mod exit; pub mod layout; pub mod prim_alloc; +pub mod time; pub mod types; pub mod guest_handle { diff --git a/src/hyperlight_guest/src/time.rs b/src/hyperlight_guest/src/time.rs new file mode 100644 index 000000000..d282265e3 --- /dev/null +++ b/src/hyperlight_guest/src/time.rs @@ -0,0 +1,361 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Low-level guest time functions using the paravirtualized clock. +//! +//! This module provides low-level functions to read time without VM exits by +//! consulting the shared clock page populated by the host. The page lives at +//! 
a fixed, compile-time-known guest-virtual address inside the scratch +//! region (see [`hyperlight_common::layout::clock_page_gva`]), so no +//! per-sandbox discovery data — such as a PEB field — is required. +//! +//! # For most users +//! +//! Use [`hyperlight_guest_bin::time`] instead, which provides a +//! `std::time`-compatible API (`SystemTime`, `Instant`) built on top of the +//! free functions here. +//! +//! # Supported clock sources +//! +//! - **KVM pvclock** — used when running under KVM. +//! - **Hyper-V Reference TSC** — used when running under MSHV or WHP. +//! +//! Which one is active is decided by the host and advertised by the +//! `clock_type` field in the scratch bookkeeping page. When the host is built +//! without the `enable_guest_clock` feature the field reads back as +//! [`ClockType::None`] and every function in this module returns `None`. +//! +//! # Concurrency invariant (current) +//! +//! In the current Hyperlight execution model the guest vCPU runs only +//! while the host thread is blocked inside the vCPU run call: the host +//! writes the clock page **before** entering the guest and cannot mutate +//! it while the guest reads. There is therefore no concurrent writer in +//! practice and the seqlock retry, the acquire fences, and the per-field +//! `read_volatile`s will never actually fire at runtime today. +//! +//! These primitives are kept anyway because: (1) they future-proof +//! against multi-vCPU sandboxes, async host-side clock updates, or +//! live migration; and (2) by never creating a `&T` over +//! hypervisor-mutable memory we satisfy Rust's aliasing rules +//! unconditionally. + +use core::sync::atomic::{Ordering, fence}; + +use hyperlight_common::layout::{ + SCRATCH_TOP_BOOT_TIME_NS_OFFSET, SCRATCH_TOP_CLOCK_TYPE_OFFSET, clock_page_gva, +}; +use hyperlight_common::time::{ + ClockType, HvReferenceTscPage, KvmPvclockVcpuTimeInfo, PVCLOCK_TSC_STABLE_BIT, +}; + +/// The guest-virtual address of the top of scratch memory. 
The +/// bookkeeping fields (`clock_type`, `boot_time_ns`, etc.) are stored +/// as negative offsets from this address. +const SCRATCH_TOP_GVA: u64 = hyperlight_common::layout::MAX_GVA as u64 + 1; + +/// Error type for clock validation failures. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ClockValidationError { + /// Clock is not configured. Either the host was built without the + /// `enable_guest_clock` feature, or the bookkeeping page contains an unknown + /// discriminant that we treat as "unavailable" out of caution. + NotConfigured, + /// KVM pvclock does not have `PVCLOCK_TSC_STABLE_BIT` set. This + /// indicates the TSC is not stable across vCPUs on this host. + KvmTscNotStable, + /// Hyper-V Reference TSC page has `tsc_sequence == 0`, which in the + /// TLFS is the host's "fall back to MSR" sentinel. MSR reads require a + /// VM exit which is not available from a Hyperlight guest, so this is + /// reported as an error rather than retried. + HyperVTscSequenceZero, +} + +/// Read the `clock_type` field from the scratch bookkeeping page. +#[inline] +fn read_clock_type() -> ClockType { + // SAFETY: the bookkeeping page at the top of scratch is always mapped + // RW; reads of any 8-byte aligned u64 inside it are well-defined. + // Zero-initialised memory decodes to `ClockType::None`. + let ptr = (SCRATCH_TOP_GVA - SCRATCH_TOP_CLOCK_TYPE_OFFSET) as *const u64; + let raw = unsafe { core::ptr::read_volatile(ptr) }; + ClockType::from(raw) +} + +/// Read the `boot_time_ns` field from the scratch bookkeeping page. +#[inline] +fn read_boot_time_ns() -> u64 { + // SAFETY: see `read_clock_type`. + let ptr = (SCRATCH_TOP_GVA - SCRATCH_TOP_BOOT_TIME_NS_OFFSET) as *const u64; + unsafe { core::ptr::read_volatile(ptr) } +} + +/// Returns `true` when the host has armed a paravirtualized clock for this +/// sandbox. Cheap - just a single read of the bookkeeping field. 
+#[inline] +pub fn is_available() -> bool { + !matches!(read_clock_type(), ClockType::None) +} + +/// Validate that the paravirtualized clock is properly configured and stable. +/// +/// This is an optional defense-in-depth check a guest can make once during +/// initialisation. The host should have already verified invariant TSC +/// support when enabling the feature; this catches accidental +/// misconfiguration. +pub fn validate_clock() -> Result<(), ClockValidationError> { + match read_clock_type() { + ClockType::KvmPvclock => { + // SAFETY: the clock page is mapped read/write into the guest's + // scratch region for the lifetime of the sandbox, and a + // `KvmPvclockVcpuTimeInfo` (32 bytes) fits at offset 0. We use + // raw-pointer `read_volatile` instead of materialising a + // `&KvmPvclockVcpuTimeInfo` so the reader stays sound under + // Rust's aliasing rules even if a future Hyperlight execution + // model lets the host mutate this page concurrently with the + // guest. See module-level "Concurrency invariant" note. + let ptr = clock_page_gva() as *const KvmPvclockVcpuTimeInfo; + let flags = unsafe { core::ptr::read_volatile(&raw const (*ptr).flags) }; + if (flags & PVCLOCK_TSC_STABLE_BIT) == 0 { + return Err(ClockValidationError::KvmTscNotStable); + } + Ok(()) + } + ClockType::HyperVReferenceTsc => { + // SAFETY: as above. `HvReferenceTscPage` fills the full 4 KiB + // page; we only read the `tsc_sequence` header field here. + let ptr = clock_page_gva() as *const HvReferenceTscPage; + let seq = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_sequence) }; + if seq == 0 { + return Err(ClockValidationError::HyperVTscSequenceZero); + } + Ok(()) + } + ClockType::None => Err(ClockValidationError::NotConfigured), + } +} + +/// Read the CPU's Time Stamp Counter. 
+#[inline] +fn rdtsc() -> u64 { + #[cfg(target_arch = "x86_64")] + { + // SAFETY: RDTSC is unprivileged on x86_64 and always present on + // CPUs that support the paravirtualized clock (host-verified + // invariant TSC). + unsafe { core::arch::x86_64::_rdtsc() } + } + #[cfg(not(target_arch = "x86_64"))] + { + 0 // TSC not available on non-x86_64 architectures. + } +} + +/// Maximum number of retries when the hypervisor is concurrently updating +/// the paravirtualized clock page. +/// +/// Both the KVM pvclock and Hyper-V Reference TSC protocols use a +/// seqlock-style mechanism: the hypervisor bumps a sequence/version counter +/// before and after mutating the page, and readers must retry if they +/// observe an in-progress or changed counter. Mutations are extremely +/// short, so a small retry cap is plenty; the hypervisor's design assumes +/// the client spin-retries rather than falling back to an MSR (which would +/// force a VM exit and defeat the whole point of the paravirtualized +/// clock). +const CLOCK_SEQLOCK_MAX_RETRIES: u32 = 100; + +/// Read time from the KVM pvclock structure. +/// +/// Uses the seqlock-style protocol described in +/// <https://docs.kernel.org/virt/kvm/x86/msr.html#pvclock>: the host sets +/// `version` to an odd value before mutating and to a new even value +/// afterwards; readers retry while `version` is odd or changes across the +/// read. We cap retries with [`CLOCK_SEQLOCK_MAX_RETRIES`] so that a +/// pathologically churning host can't make us spin forever. +fn read_kvm_pvclock() -> Option<u64> { + // SAFETY: see `validate_clock` for the mapping invariant. Today the + // host cannot mutate this page while the guest is running (single + // vCPU, host-then-guest scheduling), so the seqlock loop and the + // volatile loads are not strictly required for correctness right now.
+ // We keep the upstream pvclock contract verbatim so that: + // (a) the reader is sound under Rust's aliasing rules regardless of + // what the host is doing — no `&T` is ever taken over this + // memory; and + // (b) no behavioural change is needed when Hyperlight gains + // multi-vCPU sandboxes or async host-side clock updates. + let ptr = clock_page_gva() as *const KvmPvclockVcpuTimeInfo; + + for _ in 0..CLOCK_SEQLOCK_MAX_RETRIES { + let version1 = unsafe { core::ptr::read_volatile(&raw const (*ptr).version) }; + if version1 & 1 != 0 { + core::hint::spin_loop(); + continue; // Update in progress. + } + + // Pair with the hypervisor's write barrier between the version bump + // and the payload write. On x86_64 an Acquire fence is free (no + // instruction emitted), but we keep it for correctness under the + // memory model. + fence(Ordering::Acquire); + + let tsc_timestamp = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_timestamp) }; + let system_time = unsafe { core::ptr::read_volatile(&raw const (*ptr).system_time) }; + let tsc_to_system_mul = + unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_to_system_mul) }; + let tsc_shift = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_shift) }; + + fence(Ordering::Acquire); + + let version2 = unsafe { core::ptr::read_volatile(&raw const (*ptr).version) }; + if version1 != version2 { + core::hint::spin_loop(); + continue; // Data changed mid-read. + } + + let tsc_now = rdtsc(); + let tsc_delta = tsc_now.wrapping_sub(tsc_timestamp); + + // KVM pvclock scaler, per + // the kernel's KVM pvclock MSR documentation: + // `ns = (tsc_delta * tsc_to_system_mul) >> (32 - tsc_shift)`. + // We clamp the right-shift count to `[0, 63]` so a + // buggy host cannot induce UB / panic via an out-of-range shift; + // values outside the documented `tsc_shift ∈ [-31, 31]` band + // produce non-meaningful timings, but the reader stays sound.
+ let raw_shift = 32i32 - tsc_shift as i32; + let shift = raw_shift.clamp(0, 63) as u32; + let ns_delta = ((tsc_delta as u128 * tsc_to_system_mul as u128) >> shift) as u64; + + return Some(system_time.wrapping_add(ns_delta)); + } + + None +} + +/// Read time from the Hyper-V Reference TSC page. +/// +/// Uses the seqlock-style protocol described in TLFS §12.7. A sequence of +/// 0 is a persistent "fall back to MSR" signal from the host; we return +/// `None` without retrying because MSR reads require a VM exit that is +/// unavailable inside a Hyperlight guest. +fn read_hv_reference_tsc() -> Option<u64> { + // SAFETY: see `read_kvm_pvclock` for the aliasing / volatile rationale. + let ptr = clock_page_gva() as *const HvReferenceTscPage; + + for _ in 0..CLOCK_SEQLOCK_MAX_RETRIES { + let seq1 = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_sequence) }; + if seq1 == 0 { + return None; // Persistent MSR-fallback sentinel. + } + + fence(Ordering::Acquire); + + let tsc_scale = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_scale) }; + let tsc_offset = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_offset) }; + + fence(Ordering::Acquire); + + let seq2 = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_sequence) }; + if seq1 != seq2 { + core::hint::spin_loop(); + continue; // Host updated the page mid-read. + } + + let tsc_now = rdtsc(); + + // Hyper-V Reference TSC formula (TLFS §12.7): + // `time_100ns = ((tsc * scale) >> 64) + offset` + // The high 64 bits of a 128-bit multiply give the scaled value. + // We use `checked_add_signed` on the offset addition: an overflow + // here would mean the host's `tsc_offset` is so far out of band + // that `time_100ns` cannot be represented, which we treat as + // "clock unavailable" rather than retrying — the offset is + // host-written and stable, so retrying cannot rescue it.
+ let scaled = ((tsc_now as u128 * tsc_scale as u128) >> 64) as u64; + let time_100ns = scaled.checked_add_signed(tsc_offset)?; + + return time_100ns.checked_mul(100); + } + + None +} + +/// Monotonic time in nanoseconds. +/// +/// The value is an absolute counter from the hypervisor's time base +/// (kvmclock on KVM, partition reference time on Hyper-V). It is +/// monotonically increasing and suitable for measuring elapsed time +/// between two reads, but its epoch is unspecified — do not assume +/// it starts at zero when the sandbox is created. +/// +/// Returns `None` if the clock is not configured, or if the retry cap was +/// exhausted (the caller may retry). +pub fn monotonic_time_ns() -> Option<u64> { + match read_clock_type() { + ClockType::KvmPvclock => read_kvm_pvclock(), + ClockType::HyperVReferenceTsc => read_hv_reference_tsc(), + ClockType::None => None, + } +} + +/// Wall-clock time in nanoseconds since the Unix epoch. +/// +/// Returns `None` if: +/// - The clock is not configured (`clock_type == None`). +/// - `boot_time_ns` has not been stamped yet (it is zero before +/// `arm_clock` runs). On some backends the host's monotonic clock +/// source is unreliable until after the first vCPU run, so +/// wall clock is unavailable during `hyperlight_main` (init). +/// Monotonic time works fine during init. Wall clock becomes +/// available on the first dispatch call. +/// - The underlying monotonic read fails. +/// +/// The host computes `boot_time_ns` as the Unix-epoch origin of the +/// monotonic clock (`wall_now - monotonic_now`, sampled back-to-back +/// in `arm_clock`) and stamps it into the scratch bookkeeping page. The +/// guest simply adds its live monotonic reading to recover wall time. +/// +/// This host-side computation is necessary because Hyper-V has no +/// guest-accessible wall-clock register (unlike KVM's +/// `MSR_KVM_WALL_CLOCK_NEW`). We use the same host-computed approach +/// on all backends for uniformity.
+pub fn wall_clock_time_ns() -> Option<u64> { + let monotonic = monotonic_time_ns()?; + let boot_time = read_boot_time_ns(); + // boot_time_ns == 0 means the host hasn't stamped it yet + // (scratch memory is zero-initialised). Return None rather + // than returning a nonsense value. + if boot_time == 0 { + return None; + } + Some(boot_time.wrapping_add(monotonic)) +} + +/// Monotonic time in microseconds. +/// +/// See [`monotonic_time_ns`] for details on the time base. +pub fn monotonic_time_us() -> Option<u64> { + monotonic_time_ns().map(|ns| ns / 1_000) +} + +/// Wall-clock time as `(seconds, sub-second nanoseconds)` since the Unix +/// epoch. Shape matches a POSIX `timespec`. +pub fn wall_clock_time() -> Option<(u64, u32)> { + let ns = wall_clock_time_ns()?; + let secs = ns / 1_000_000_000; + let nsecs = (ns % 1_000_000_000) as u32; + Some((secs, nsecs)) +} From c3ec7db39d56918790af515e8dfc7b81663a7386 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 23 Apr 2026 12:53:59 +0100 Subject: [PATCH 06/10] guest_bin: add std::time-compatible Instant and SystemTime Adds hyperlight_guest_bin::time providing Instant, SystemTime, UNIX_EPOCH and a TimeError enum on top of the low-level hyperlight_guest::time primitives. The API mirrors std::time so guest code that would ordinarily use std::time::Instant / std::time::SystemTime can be ported with minimal churn, while remaining no_std compatible. Constructors are fallible: they surface Unavailable when the host was built without the enable_guest_clock feature and Retry when the seqlock retry cap is exhausted. SystemTime::duration_since matches the std shape by returning the magnitude of a negative difference in its error variant.
Signed-off-by: Simon Davies --- src/hyperlight_guest_bin/src/lib.rs | 1 + src/hyperlight_guest_bin/src/time.rs | 222 +++++++++++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100644 src/hyperlight_guest_bin/src/time.rs diff --git a/src/hyperlight_guest_bin/src/lib.rs b/src/hyperlight_guest_bin/src/lib.rs index 84e328892..f5fd827c2 100644 --- a/src/hyperlight_guest_bin/src/lib.rs +++ b/src/hyperlight_guest_bin/src/lib.rs @@ -53,6 +53,7 @@ pub mod host_comm; pub mod memory; #[cfg(target_arch = "x86_64")] pub mod paging; +pub mod time; /// Bridge between picolibc's POSIX expectations and the Hyperlight host. #[cfg(feature = "libc")] diff --git a/src/hyperlight_guest_bin/src/time.rs b/src/hyperlight_guest_bin/src/time.rs new file mode 100644 index 000000000..ed9f5031d --- /dev/null +++ b/src/hyperlight_guest_bin/src/time.rs @@ -0,0 +1,222 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! A `std::time`-compatible API built on the paravirtualized guest clock. +//! +//! This module provides [`Instant`] and [`SystemTime`] types that mirror the +//! shape of `std::time::Instant` and `std::time::SystemTime`, so guest code +//! that ordinarily uses the standard library's time APIs can be ported with +//! minimal changes. +//! +//! # Clock source +//! +//! Both types read from the shared paravirtualized clock page armed by the +//! host. See [`hyperlight_guest::time`] for the low-level details. +//! +//! # Availability +//! +//! 
If the host was built without the `enable_guest_clock` feature, every +//! constructor in this module returns [`TimeError::Unavailable`]. A guest +//! that wants to gracefully degrade should probe [`is_available`] once at +//! start-up rather than relying on `Instant::now()` to fail later. +//! +//! # Example +//! +//! ```no_run +//! use hyperlight_guest_bin::time::{Instant, SystemTime, UNIX_EPOCH}; +//! +//! if let Ok(start) = Instant::now() { +//! do_some_work(); +//! if let Ok(elapsed) = start.elapsed() { +//! log::info!("work took {} us", elapsed.as_micros()); +//! } +//! } +//! +//! if let Ok(now) = SystemTime::now() +//! && let Ok(since_epoch) = now.duration_since(UNIX_EPOCH) +//! { +//! log::info!("wall-clock seconds since epoch: {}", since_epoch.as_secs()); +//! } +//! # fn do_some_work() {} +//! ``` + +use core::time::Duration; + +use hyperlight_guest::time; + +/// Errors returned by the time API. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TimeError { + /// The host did not arm a paravirtualized clock for this sandbox (the + /// host was built without the `enable_guest_clock` feature, or clock + /// setup failed). + Unavailable, + /// The seqlock retry cap was exhausted. The caller may simply retry. + Retry, + /// `SystemTime::duration_since` was called with an argument that lies + /// in the future relative to `self`. + NegativeDuration(Duration), +} + +impl core::fmt::Display for TimeError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + Self::Unavailable => f.write_str("guest clock is not available"), + Self::Retry => f.write_str("guest clock read retry cap exhausted"), + Self::NegativeDuration(_) => f.write_str("second time is later than self"), + } + } +} + +/// Returns `true` if the host has armed a paravirtualized clock. +#[inline] +pub fn is_available() -> bool { + time::is_available() +} + +/// Read raw monotonic nanoseconds, or convert a [`time`] read failure into a +/// [`TimeError`]. 
Factored out so `Instant::now` and `SystemTime::now` share +/// the same failure classification. +#[inline] +fn read_monotonic_ns() -> Result<u64, TimeError> { + if !time::is_available() { + return Err(TimeError::Unavailable); + } + time::monotonic_time_ns().ok_or(TimeError::Retry) +} + +/// A measurement of a monotonically non-decreasing clock, analogous to +/// [`std::time::Instant`]. +/// +/// Unlike `std::time::Instant`, construction is fallible: it returns +/// `TimeError::Unavailable` when the host has no guest-clock feature +/// enabled, and `TimeError::Retry` on a (vanishingly rare) seqlock retry +/// storm. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Instant { + /// Nanoseconds on the paravirt clock's monotonic time base; the + /// epoch is unspecified (not necessarily sandbox creation). + ns: u64, +} + +impl Instant { + /// Returns an instant corresponding to "now". + pub fn now() -> Result<Self, TimeError> { + Ok(Self { + ns: read_monotonic_ns()?, + }) + } + + /// Returns the amount of time elapsed from another instant to this one, + /// or `None` if that instant is later than this one. + pub fn checked_duration_since(&self, earlier: Instant) -> Option<Duration> { + self.ns.checked_sub(earlier.ns).map(Duration::from_nanos) + } + + /// Returns the amount of time elapsed from another instant to this one, + /// saturating at zero when the other instant is later. + pub fn saturating_duration_since(&self, earlier: Instant) -> Duration { + self.checked_duration_since(earlier) + .unwrap_or(Duration::ZERO) + } + + /// Returns the amount of time elapsed since this instant. + pub fn elapsed(&self) -> Result<Duration, TimeError> { + let now = Self::now()?; + Ok(now.saturating_duration_since(*self)) + } +} + +impl core::ops::Sub for Instant { + type Output = Duration; + + /// Panics if `rhs` is later than `self`. Note that current + /// `std::time::Instant::sub` saturates to zero instead.
+ fn sub(self, rhs: Instant) -> Duration { + self.checked_duration_since(rhs) + .expect("supplied instant is later than self") + } +} + +/// A measurement of the system clock, analogous to +/// [`std::time::SystemTime`]. +/// +/// Represents wall-clock time, using the host's boot-time stamp combined +/// with the paravirtualized monotonic clock. Snapshot-restore preserves +/// the freshly re-stamped boot time, so `SystemTime::now()` will jump +/// forward by real elapsed wall-clock time across a restore — exactly the +/// behaviour a guest using `std::time::SystemTime` would expect. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct SystemTime { + /// Nanoseconds since the Unix epoch. + ns: u64, +} + +/// An anchor point corresponding to 1970-01-01 00:00:00 UTC. Subtract from +/// a `SystemTime` to get the wall-clock duration since the epoch. +pub const UNIX_EPOCH: SystemTime = SystemTime { ns: 0 }; + +impl SystemTime { + /// Returns the current wall-clock time. + pub fn now() -> Result<Self, TimeError> { + if !time::is_available() { + return Err(TimeError::Unavailable); + } + let ns = time::wall_clock_time_ns().ok_or(TimeError::Retry)?; + Ok(Self { ns }) + } + + /// Returns the duration from `earlier` to `self`, or + /// `TimeError::NegativeDuration(d)` — where `d` is the magnitude of the + /// difference — if `earlier` is later than `self`. Mirrors + /// `std::time::SystemTime::duration_since`. + pub fn duration_since(&self, earlier: SystemTime) -> Result<Duration, TimeError> { + if self.ns >= earlier.ns { + Ok(Duration::from_nanos(self.ns - earlier.ns)) + } else { + Err(TimeError::NegativeDuration(Duration::from_nanos( + earlier.ns - self.ns, + ))) + } + } + + /// Returns the amount of time elapsed since `self`.
+ pub fn elapsed(&self) -> Result { + let now = Self::now()?; + now.duration_since(*self) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // The error type is both `Debug` and `Display` so it plays nicely with + // `?` in guest code and with `log::error!("{err}")` style logging. + #[test] + fn time_error_display() { + extern crate std; + use std::format; + assert_eq!( + format!("{}", TimeError::Unavailable), + "guest clock is not available" + ); + assert_eq!( + format!("{}", TimeError::NegativeDuration(Duration::from_secs(1))), + "second time is later than self" + ); + } +} From 13abd7dbb41a67c83277f900af10db3b5c3126c7 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 23 Apr 2026 13:36:17 +0100 Subject: [PATCH 07/10] guest_bin: wire libc clock_gettime/gettimeofday to paravirt clock The picolibc POSIX bridge previously returned a synthetic 1s-per-call time for CLOCK_REALTIME, CLOCK_MONOTONIC and gettimeofday. When the host arms a paravirtualized clock those callers now receive real wall-clock and monotonic time sourced from hyperlight_guest::time, with the counter-based fallback preserved for hosts built without the enable_guest_clock feature. Signed-off-by: Simon Davies --- src/hyperlight_guest_bin/src/libc.rs | 40 +++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/src/hyperlight_guest_bin/src/libc.rs b/src/hyperlight_guest_bin/src/libc.rs index dbb0d4cab..06089d8ae 100644 --- a/src/hyperlight_guest_bin/src/libc.rs +++ b/src/hyperlight_guest_bin/src/libc.rs @@ -20,6 +20,7 @@ use core::ffi::*; use core::sync::atomic::{AtomicU64, Ordering}; use hyperlight_common::flatbuffer_wrappers::function_types::{ParameterValue, ReturnType}; +use hyperlight_guest::time; use crate::host_comm::call_host_function; @@ -58,13 +59,32 @@ pub(crate) struct Timeval { tv_usec: c_long, } -/// Returns a synthetic monotonically-increasing time starting at Unix epoch -/// increasing 1s each call. 
-fn current_time() -> (u64, u64) { +/// Fallback clock used when the host has not armed a paravirtualized +/// clock. Returns a synthetic `(secs, nsecs)` pair that advances by one +/// second per call, preserving long-standing guest behaviour for hosts +/// built without the `enable_guest_clock` feature. +fn fallback_time() -> (u64, u64) { let call_count = CURRENT_TIME.fetch_add(1, Ordering::Relaxed) + 1; (call_count, 0) } +/// Returns `(secs, nsecs)` for `CLOCK_REALTIME` (wall-clock). +fn realtime() -> (u64, u64) { + match time::wall_clock_time() { + Some((secs, nsecs)) => (secs, nsecs as u64), + None => fallback_time(), + } +} + +/// Returns `(secs, nsecs)` for `CLOCK_MONOTONIC` (time since sandbox +/// creation). +fn monotonic() -> (u64, u64) { + match time::monotonic_time_ns() { + Some(ns) => (ns / 1_000_000_000, ns % 1_000_000_000), + None => fallback_time(), + } +} + #[unsafe(no_mangle)] pub extern "C" fn read(fd: c_int, buf: *mut c_void, count: usize) -> isize { if buf.is_null() && count > 0 { @@ -115,8 +135,16 @@ pub extern "C" fn clock_gettime(clk_id: c_ulong, tp: *mut Timespec) -> c_int { } match clk_id { - CLOCK_REALTIME | CLOCK_MONOTONIC => { - let (secs, nanos) = current_time(); + CLOCK_REALTIME => { + let (secs, nanos) = realtime(); + unsafe { + (*tp).tv_sec = secs as c_long; + (*tp).tv_nsec = nanos as c_long; + } + 0 + } + CLOCK_MONOTONIC => { + let (secs, nanos) = monotonic(); unsafe { (*tp).tv_sec = secs as c_long; (*tp).tv_nsec = nanos as c_long; @@ -137,7 +165,7 @@ pub extern "C" fn gettimeofday(tv: *mut Timeval, _tz: *mut c_void) -> c_int { return -1; } - let (secs, nanos) = current_time(); + let (secs, nanos) = realtime(); unsafe { (*tv).tv_sec = secs as c_long; (*tv).tv_usec = (nanos / 1000) as c_long; From 3c1404242d5018e936f7cc302924ad03161bf26f Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 23 Apr 2026 22:10:06 +0100 Subject: [PATCH 08/10] Reserve clock page from guest stack & add guest-clock integration tests The paravirtualized 
clock page lives at the top of scratch, below the small bookkeeping trailer. Both the main/init stack top and the guest's IST1 (exception) stack were set to 'MAX_GVA - SCRATCH_TOP_EXN_STACK_OFFSET + 1', so any write to either stack — and in particular page-fault handlers running on IST1 — would grow down through the clock page and clobber its 'clock_type' / 'boot_time_ns' trailer before the guest could read it. Move both stack tops unconditionally to 'MAX_GVA - SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1' so they sit strictly below the reserved clock page. Also bump the guest's physical page allocator top-of-scratch reservation to three pages (bookkeeping, guest-counter, clock) to match. The clock page is always reserved regardless of the host's 'enable_guest_clock' feature to keep the memory layout stable. Add five integration tests gated on 'enable_guest_clock' that verify: * the guest observes the clock as available, * the monotonic clock advances across calls, * the wall clock tracks the host wall clock, * the wall clock is re-stamped across snapshot/restore, * the wall clock shows no constant offset from the host after an idle period. 
Signed-off-by: Simon Davies --- .github/workflows/dep_build_test.yml | 5 + Justfile | 3 + .../src/arch/amd64/prim_alloc.rs | 11 +- .../src/arch/amd64/init.rs | 6 +- .../src/hypervisor/hyperlight_vm/x86_64.rs | 4 +- src/hyperlight_host/src/hypervisor/mod.rs | 2 +- src/hyperlight_host/tests/guest_clock_test.rs | 186 ++++++++++++++++++ src/tests/rust_guests/simpleguest/src/main.rs | 24 +++ 8 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 src/hyperlight_host/tests/guest_clock_test.rs diff --git a/.github/workflows/dep_build_test.yml b/.github/workflows/dep_build_test.yml index 32ac2a306..8b5c4651e 100644 --- a/.github/workflows/dep_build_test.yml +++ b/.github/workflows/dep_build_test.yml @@ -114,6 +114,11 @@ jobs: # with hw-interrupts feature enabled (+ explicit driver on Linux) just test ${{ inputs.config }} ${{ runner.os == 'Linux' && (inputs.hypervisor == 'mshv3' && 'mshv3,hw-interrupts' || 'kvm,hw-interrupts') || 'hw-interrupts' }} + - name: Run Rust tests with enable_guest_clock + run: | + # with enable_guest_clock + hw-interrupts (+ explicit driver on Linux) + just test ${{ inputs.config }} ${{ runner.os == 'Linux' && (inputs.hypervisor == 'mshv3' && 'mshv3,hw-interrupts,enable_guest_clock' || 'kvm,hw-interrupts,enable_guest_clock') || 'hw-interrupts,enable_guest_clock' }} + - name: Run Rust Gdb tests env: RUST_LOG: debug diff --git a/Justfile b/Justfile index 90b9ba61e..173e18f55 100644 --- a/Justfile +++ b/Justfile @@ -91,6 +91,9 @@ test-like-ci config=default-target hypervisor="kvm": @# with hw-interrupts enabled (+ explicit driver on Linux) {{ if os() == "linux" { if hypervisor == "mshv3" { "just test " + config + " mshv3,hw-interrupts" } else { "just test " + config + " kvm,hw-interrupts" } } else { "just test " + config + " hw-interrupts" } }} + @# with enable_guest_clock (+ explicit driver + hw-interrupts on Linux) + {{ if os() == "linux" { if hypervisor == "mshv3" { "just test " + config + " mshv3,hw-interrupts,enable_guest_clock" } 
else { "just test " + config + " kvm,hw-interrupts,enable_guest_clock" } } else { "just test " + config + " hw-interrupts,enable_guest_clock" } }} + @# make sure certain cargo features compile just check diff --git a/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs b/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs index cfaad9a0b..9cb26293e 100644 --- a/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs +++ b/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs @@ -31,9 +31,14 @@ pub unsafe fn alloc_phys_pages(n: u64) -> u64 { x = inout(reg) x ); } - // Set aside two pages at the top of the scratch region for the - // exception stack, shared state, etc - let max_avail = hyperlight_common::layout::MAX_GPA - hyperlight_common::vmem::PAGE_SIZE * 2; + // Set aside three pages at the top of the scratch region: + // - top page: size/allocator/snapshot-PT/exn-stack bookkeeping + // - next page down: the reserved guest-counter / shared-state page + // - third page down: the paravirtualized guest clock page + // The clock page is always reserved even when the host is built + // without the `enable_guest_clock` feature, so that the physical + // allocator's footprint is independent of host-side features. + let max_avail = hyperlight_common::layout::MAX_GPA - hyperlight_common::vmem::PAGE_SIZE * 3; if x.checked_add(nbytes) .is_none_or(|xx| xx >= max_avail as u64) { diff --git a/src/hyperlight_guest_bin/src/arch/amd64/init.rs b/src/hyperlight_guest_bin/src/arch/amd64/init.rs index 073bd3a2f..8fc5d9911 100644 --- a/src/hyperlight_guest_bin/src/arch/amd64/init.rs +++ b/src/hyperlight_guest_bin/src/arch/amd64/init.rs @@ -92,8 +92,12 @@ unsafe fn init_tss(pc: *mut ProcCtrl) { let tss_ptr = &raw mut (*pc).tss; // copy byte by byte to avoid alignment issues let ist1_ptr = &raw mut (*tss_ptr).ist1 as *mut [u8; 8]; + // The exception stack (IST1) grows downward. 
Place it below + // the reserved clock page so page-fault / COW handlers never + // clobber the hypervisor-owned clock page or the bookkeeping + // data at the top of scratch. let exn_stack = hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1; ist1_ptr.write_volatile(exn_stack.to_ne_bytes()); asm!( diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs index af8f08617..27f23c721 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs @@ -1510,7 +1510,7 @@ mod tests { let peb_address = gshm.layout.peb_address; let stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1; let mut vm = set_up_hypervisor_partition( gshm, @@ -2123,7 +2123,7 @@ mod tests { /// Get the stack top GVA, same as the regular codepath. 
fn stack_top_gva(&self) -> u64 { hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1 } } diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index be1a15c22..d87752e1b 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -488,7 +488,7 @@ pub(crate) mod tests { UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?; let (mut mem_mgr, gshm) = sandbox.mgr.build().unwrap(); let exn_stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1; let mut vm = set_up_hypervisor_partition( gshm, diff --git a/src/hyperlight_host/tests/guest_clock_test.rs b/src/hyperlight_host/tests/guest_clock_test.rs new file mode 100644 index 000000000..24d466e39 --- /dev/null +++ b/src/hyperlight_host/tests/guest_clock_test.rs @@ -0,0 +1,186 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Integration tests for the paravirtualized guest clock, only compiled +//! when the `enable_guest_clock` feature is enabled on `hyperlight-host`. 
+#![cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + +use std::thread; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +pub mod common; +use crate::common::with_rust_sandbox; +/// Minimum real wait used by the monotonic advance test. Chosen large +/// enough to dwarf any plausible CI scheduling jitter but small enough +/// not to slow the test suite down noticeably. +const MONOTONIC_ADVANCE_SLEEP: Duration = Duration::from_millis(50); + +/// Minimum real wait used by the restore-doesn't-freeze-wall-clock test. +/// Must be comfortably larger than `WALL_CLOCK_ADVANCE_TOLERANCE` below. +const RESTORE_SLEEP: Duration = Duration::from_millis(200); + +/// Allowed "play" when comparing post-restore wall-clock time against the +/// host's notion of now. Accounts for the guest call round-trip plus any +/// CI jitter. Kept generous because the test only needs to prove that +/// the clock was re-stamped, not that it is sub-millisecond accurate. +const WALL_CLOCK_ADVANCE_TOLERANCE: Duration = Duration::from_millis(500); + +/// How long to sit idle after sandbox creation in the no-drift test below. +/// Long enough that any constant offset between guest and host wall +/// clocks (e.g. from a stale `boot_time_ns` calibration) dominates over +/// scheduling jitter. +const IDLE_BEFORE_FIRST_CALL: Duration = Duration::from_secs(2); + +/// Tight tolerance used by the no-drift test. +/// +/// The host computes `boot_time_ns = wall_now - monotonic_now` +/// back-to-back in `arm_clock` (where `monotonic_now` comes from +/// `KVM_GET_CLOCK` on KVM, or `HV_REGISTER_TIME_REF_COUNT` on +/// Hyper-V). On KVM, `KVM_GET_CLOCK` can disagree with the live +/// pvclock page by up to ~13ms (observed on WSL2; root cause +/// uncertain — may be smaller on bare metal). The 20ms tolerance +/// accommodates this while still catching formula bugs (e.g. +/// omitting the monotonic subtraction produces ~100ms+ drift). 
+const WALL_CLOCK_TIGHT_TOLERANCE: Duration = Duration::from_millis(20); + +#[test] +fn clock_is_available_under_enable_guest_clock() { + with_rust_sandbox(|mut sbox| { + let available: i32 = sbox.call("ClockIsAvailable", ()).unwrap(); + assert_eq!(available, 1, "guest clock should be armed by the host"); + }); +} + +#[test] +fn monotonic_time_advances_across_calls() { + with_rust_sandbox(|mut sbox| { + let first: i64 = sbox.call("GetMonotonicTimeNs", ()).unwrap(); + assert!(first >= 0, "guest reported clock unavailable: {first}"); + + thread::sleep(MONOTONIC_ADVANCE_SLEEP); + + let second: i64 = sbox.call("GetMonotonicTimeNs", ()).unwrap(); + assert!(second >= 0, "guest reported clock unavailable: {second}"); + + let delta_ns = second - first; + assert!( + delta_ns >= MONOTONIC_ADVANCE_SLEEP.as_nanos() as i64 / 2, + "monotonic clock did not advance enough: first={first} second={second} \ + delta_ns={delta_ns}" + ); + }); +} + +#[test] +fn wall_clock_tracks_host_wall_clock() { + with_rust_sandbox(|mut sbox| { + let guest_ns: i64 = sbox.call("GetWallClockTimeNs", ()).unwrap(); + assert!( + guest_ns >= 0, + "guest reported wall-clock unavailable: {guest_ns}" + ); + + let host_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() as i64; + + let skew_ns = (host_ns - guest_ns).abs(); + assert!( + skew_ns < WALL_CLOCK_ADVANCE_TOLERANCE.as_nanos() as i64, + "guest wall-clock differs from host by {skew_ns} ns \ + (guest={guest_ns}, host={host_ns})" + ); + }); +} + +/// Snapshot / restore must re-stamp the host's `boot_time_ns` so the guest +/// sees real elapsed wall-clock time across the restore rather than a +/// frozen instant from when the snapshot was taken. 
+#[test] +fn wall_clock_advances_across_snapshot_restore() { + with_rust_sandbox(|mut sbox| { + let snapshot = sbox.snapshot().unwrap(); + + let before: i64 = sbox.call("GetWallClockTimeNs", ()).unwrap(); + assert!( + before >= 0, + "guest reported wall-clock unavailable: {before}" + ); + + thread::sleep(RESTORE_SLEEP); + sbox.restore(snapshot).unwrap(); + + let after: i64 = sbox.call("GetWallClockTimeNs", ()).unwrap(); + assert!(after >= 0, "guest reported wall-clock unavailable: {after}"); + + let advance_ns = after - before; + // Allow half the sleep to cover scheduling jitter on the low end; + // on the high end, real elapsed time plus the guest-call overhead + // is fine. + assert!( + advance_ns >= RESTORE_SLEEP.as_nanos() as i64 / 2, + "wall-clock did not advance across snapshot/restore: \ + before={before} after={after} advance_ns={advance_ns}" + ); + }); +} + +/// Diagnostic for the `boot_time_ns` calibration formula. +/// +/// `arm_clock` stamps `boot_time_ns` and the guest computes +/// `wall = boot_time_ns + monotonic_time_ns()`. For that to match the +/// host's wall clock, `boot_time_ns` must be `wall_at_arm - monotonic_at_arm` +/// — i.e. the Unix-epoch origin of the monotonic clock — not just +/// `wall_at_arm`. If the host stamps the latter, the guest's wall clock +/// is offset ahead of the host by exactly the value of the underlying +/// paravirt counter at arm time, which on a host with non-trivial +/// uptime (or any KVM partition where `system_time` is host-wide) can +/// be arbitrarily large. +/// +/// This test waits for a real interval after sandbox creation before +/// the first guest call, then requires the guest's reported wall clock +/// to match the host's within a tight tolerance. The existing +/// [`wall_clock_tracks_host_wall_clock`] test uses a 500 ms tolerance +/// and reads immediately, both of which can mask a small constant +/// offset. This one will not. 
+#[test] +fn wall_clock_does_not_drift_after_idle() { + with_rust_sandbox(|mut sbox| { + thread::sleep(IDLE_BEFORE_FIRST_CALL); + + let guest_ns: i64 = sbox.call("GetWallClockTimeNs", ()).unwrap(); + assert!( + guest_ns >= 0, + "guest reported wall-clock unavailable: {guest_ns}" + ); + + let host_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() as i64; + + let skew_ns = (host_ns - guest_ns).abs(); + assert!( + skew_ns < WALL_CLOCK_TIGHT_TOLERANCE.as_nanos() as i64, + "guest wall-clock skew of {skew_ns} ns exceeds tolerance of {tol} ns \ + after {idle:?} idle — likely a `boot_time_ns` calibration bug \ + (guest={guest_ns}, host={host_ns})", + tol = WALL_CLOCK_TIGHT_TOLERANCE.as_nanos(), + idle = IDLE_BEFORE_FIRST_CALL, + ); + }); +} diff --git a/src/tests/rust_guests/simpleguest/src/main.rs b/src/tests/rust_guests/simpleguest/src/main.rs index b6844a716..fb08b80fc 100644 --- a/src/tests/rust_guests/simpleguest/src/main.rs +++ b/src/tests/rust_guests/simpleguest/src/main.rs @@ -725,6 +725,30 @@ fn add(a: i32, b: i32) -> Result { host_add(a, b) } +// ===== Paravirtualized guest clock test surface ===== + +#[guest_function("ClockIsAvailable")] +fn clock_is_available() -> i32 { + hyperlight_guest::time::is_available() as i32 +} + +/// Returns monotonic nanoseconds, or `-1` if the clock is unavailable. +#[guest_function("GetMonotonicTimeNs")] +fn get_monotonic_time_ns() -> i64 { + hyperlight_guest::time::monotonic_time_ns() + .and_then(|ns| i64::try_from(ns).ok()) + .unwrap_or(-1) +} + +/// Returns wall-clock nanoseconds since the Unix epoch, or `-1` if the +/// clock is unavailable. 
+#[guest_function("GetWallClockTimeNs")] +fn get_wall_clock_time_ns() -> i64 { + hyperlight_guest::time::wall_clock_time_ns() + .and_then(|ns| i64::try_from(ns).ok()) + .unwrap_or(-1) +} + // Does nothing, but used for testing large parameters #[guest_function("LargeParameters")] fn large_parameters(v: Vec, s: String) { From 1b01ea1a5201899232504c399ab4d80b3483bdf1 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Thu, 23 Apr 2026 22:13:00 +0100 Subject: [PATCH 09/10] docs: describe the paravirtualized guest clock Add a walkthrough of the 'enable_guest_clock' feature: what the guest gets (monotonic + wall-clock reads with no VM exit), how to use the Rust and C APIs, snapshot/restore semantics, the top-of-scratch memory layout, and explicit non-goals. Link it from docs/README.md. Signed-off-by: Simon Davies --- docs/README.md | 1 + docs/guest-time.md | 135 ++++++++++++++++++++++++++++ src/hyperlight_common/src/layout.rs | 7 +- 3 files changed, 141 insertions(+), 2 deletions(-) create mode 100644 docs/guest-time.md diff --git a/docs/README.md b/docs/README.md index 3b36c5db1..a4f86d836 100644 --- a/docs/README.md +++ b/docs/README.md @@ -29,6 +29,7 @@ This project is composed internally of several components, depicted in the below * [How to build a Hyperlight guest binary](./how-to-build-a-hyperlight-guest-binary.md) * [Security considerations](./security.md) * [Technical requirements document](./technical-requirements-document.md) +* [Paravirtualized guest clock](./guest-time.md) ## For developers diff --git a/docs/guest-time.md b/docs/guest-time.md new file mode 100644 index 000000000..3828da584 --- /dev/null +++ b/docs/guest-time.md @@ -0,0 +1,135 @@ +# Paravirtualized Guest Clock + +Hyperlight's `enable_guest_clock` Cargo feature gives guests a cheap way to ask +"what time is it?" without taking a VM exit. When the host is built with the +feature, every sandbox exposes a paravirtualized clock that the guest can read +using ordinary memory loads. 
+ +## What the guest gets + +When the feature is enabled the host populates a single 4 KiB "clock page" +inside the sandbox's scratch region. The guest reads two pieces of +information: + +- **A hypervisor-specific calibration block at offset `0x00`.** Written by + KVM (`kvm_clock`) or Hyper-V / MSHV (Reference TSC). Contains the TSC + frequency, scaling constants, and a sequence lock the guest uses to read it + atomically. The entire clock page is hypervisor-owned; Hyperlight does not + write to it. +- **Hyperlight metadata in the scratch bookkeeping page** (separate from the + clock page): a `u64` [`ClockType`](../src/hyperlight_common/src/time.rs) tag + and `boot_time_ns`, the Unix-epoch origin of the monotonic clock computed + by the host as `wall_now - monotonic_now` (see below). These live at fixed + offsets from the top of scratch (`-0x28` and `-0x30`), NOT in the clock + page, so a future TLFS extension cannot clobber them. + +With those two pieces the guest can compute: + +- **Monotonic nanoseconds since boot** — read the TSC, apply the scaling + factors from the calibration block, giving you a `CLOCK_MONOTONIC` + equivalent. +- **Wall-clock nanoseconds since the Unix epoch** — add `boot_time_ns` to the + monotonic value above, giving you a `CLOCK_REALTIME` / `gettimeofday` equivalent. `boot_time_ns` is computed by the host as + `SystemTime::now() - KVM_GET_CLOCK` (on KVM) or + `SystemTime::now() - TIME_REF_COUNT` (on Hyper-V) after sandbox + initialisation. Hyper-V has no equivalent to KVM's + `MSR_KVM_WALL_CLOCK_NEW`, so we use this uniform host-computed approach + on all backends. + +> **Note (KVM only):** Wall-clock time returns `None` during +> `hyperlight_main` (guest init). On KVM, `KVM_GET_CLOCK` is unreliable +> until the "master clock" is established at first vCPU entry, so +> `boot_time_ns` is stamped after init completes. Monotonic time works +> fine during init. Wall-clock time becomes available on the first +> dispatch call. 
+ +Both reads are lock-free (well, seqlock-protected for the calibration block) +and never leave the guest. + +## Using it in a Rust guest + +The guest-side API lives in `hyperlight_guest::time` for the low-level +readers and `hyperlight_guest_bin::time` for a `std::time`-flavoured +wrapper: + +```rust +// Low-level, no_std readers. +use hyperlight_guest::time; + +if time::is_available() { + let mono_ns: u64 = time::monotonic_time_ns().unwrap(); + let wall_ns: u64 = time::wall_clock_time_ns().unwrap(); +} + +// std::time-flavoured wrapper (hyperlight_guest_bin only). +use hyperlight_guest_bin::time::{Instant, SystemTime, UNIX_EPOCH}; + +let t0 = Instant::now()?; +// ... do work ... +let elapsed = t0.elapsed()?; + +let now = SystemTime::now()?; +let unix_ns = now.duration_since(UNIX_EPOCH)?.as_nanos(); +``` + +C guests that use picolibc get paravirt time for free: `hyperlight_guest_bin` +wires `clock_gettime(CLOCK_MONOTONIC|CLOCK_REALTIME)` and `gettimeofday` into +the same reader, so existing C code continues to work unchanged. + +## Snapshot / restore semantics + +Both `boot_time_ns` and the hypervisor calibration block live inside scratch +memory, which is not included in snapshots. On every +`MultiUseSandbox::restore`, the host re-arms the clock page: it re-installs +the pvclock MSR / Hyper-V register against the fresh vCPU state and stamps a +new `boot_time_ns` captured at the moment of restore. As a result a restored +guest observes wall-clock time reflecting the restore moment, not the +original boot — which is what wall clocks are supposed to do. + +## Enabling the feature + +Turn it on in the host's `Cargo.toml`: + +```toml +[dependencies] +hyperlight-host = { version = "...", features = ["enable_guest_clock"] } +``` + +The feature is x86_64 only; on aarch64 it has no effect. It is off by default +so existing sandboxes don't pay for a facility they don't use. 
When off, the +clock page is still reserved in the layout (so memory maps are stable) but +left un-mapped against any hypervisor clock source; `hyperlight_guest::time` +readers then report "unavailable" and leave it to the caller to decide what +to do about it (the picolibc wiring returns a synthetic 1-second-per-call +counter, which is enough to stop `strftime` crashing and not much else). + +## Layout details + +The clock page is the third page down from the top of the scratch region: + +| Offset from top | Size | Contents | +|-----------------|-------|------------------------------------------------| +| `-0x1000` | 4 KiB | Bookkeeping (size, allocator counter, ...) | +| `-0x2000` | 4 KiB | Reserved for shared-state counter | +| `-0x3000` | 4 KiB | Paravirtualized clock page | + +Because the clock page is at the top of scratch, both the guest's main stack +and its IST1 (exception) stack are configured to start immediately below the +clock page (at `MAX_GVA + 1 - SCRATCH_TOP_CLOCK_PAGE_OFFSET`) so stack writes +— including page-fault handlers running on IST1 — cannot clobber the clock page or the bookkeeping data above it. +The allocator reserves the top three pages unconditionally so the memory map +stays identical whether or not the feature is enabled. + +## Non-goals + +- **Sub-microsecond accuracy.** `boot_time_ns` is computed from two + back-to-back host reads (`SystemTime::now()` and `KVM_GET_CLOCK` / + `TIME_REF_COUNT`). On KVM, residual disagreement between `KVM_GET_CLOCK` + and the pvclock page can add up to ~13ms of constant offset (observed on + WSL2; root cause uncertain). On Hyper-V the offset should be negligible. +- **`CLOCK_PROCESS_CPUTIME_ID` and friends.** The clock page exposes only + monotonic and wall-clock time; per-thread / per-process CPU time is out of + scope. +- **Timers or sleeps.** The guest can read the clock but has no way to ask + the hypervisor to wake it up later — that is still done through the + existing guest-function call model. 
diff --git a/src/hyperlight_common/src/layout.rs b/src/hyperlight_common/src/layout.rs index 00f560a7b..bb7b57648 100644 --- a/src/hyperlight_common/src/layout.rs +++ b/src/hyperlight_common/src/layout.rs @@ -85,8 +85,11 @@ pub const SCRATCH_TOP_GUEST_COUNTER_OFFSET: u64 = 0x1008; /// below, at `top - SCRATCH_TOP_CLOCK_PAGE_OFFSET` + 1 byte — in other words, /// subtract this value from `MAX_GPA`/`MAX_GVA` + 1 to get the page base. /// -/// The page is only present when the host is built with the -/// `enable_guest_clock` feature. +/// The page is always reserved regardless of the `enable_guest_clock` +/// feature so that the memory layout (and therefore stack positions) +/// is stable across feature-flag builds. The host only populates it +/// when the feature is enabled; otherwise it stays zero-filled and +/// the guest sees `ClockType::None`. pub const SCRATCH_TOP_CLOCK_PAGE_OFFSET: u64 = 0x3000; /// Size of the paravirtualized clock page in bytes (one 4 KiB page). From fca0261f33cbd9d33e06bffdacf31432a35192c8 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Tue, 28 Apr 2026 22:56:05 +0100 Subject: [PATCH 10/10] guest: maintain monotonic clock continuity across cross-partition restores MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a guest-side monotonic offset mechanism so that monotonic_time_ns() never goes backward, even when a snapshot is restored into a new partition whose raw clock starts from a lower value. Two statics in BSS (survive snapshot): - RAW_HIGH_WATER: highest raw pvclock value ever returned - MONO_OFFSET: cumulative offset applied to maintain monotonicity On each read, if the raw value is less than the high-water mark, the offset is bumped by the old high-water mark. This is zero-cost on the normal (same-partition) path — just one atomic load + compare. wall_clock_time_ns() uses the raw monotonic value (no offset) because boot_time_ns is calibrated by the host against the raw clock. 
Applying the offset there would shift wall time into the future. Signed-off-by: Simon Davies --- src/hyperlight_guest/src/time.rs | 67 ++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/src/hyperlight_guest/src/time.rs b/src/hyperlight_guest/src/time.rs index d282265e3..6a261eb96 100644 --- a/src/hyperlight_guest/src/time.rs +++ b/src/hyperlight_guest/src/time.rs @@ -53,7 +53,7 @@ limitations under the License. //! hypervisor-mutable memory we satisfy Rust's aliasing rules //! unconditionally. -use core::sync::atomic::{Ordering, fence}; +use core::sync::atomic::{AtomicU64, Ordering, fence}; use hyperlight_common::layout::{ SCRATCH_TOP_BOOT_TIME_NS_OFFSET, SCRATCH_TOP_CLOCK_TYPE_OFFSET, clock_page_gva, @@ -293,22 +293,61 @@ fn read_hv_reference_tsc() -> Option { None } +/// Highest raw pvclock value ever returned. Lives in BSS so it +/// survives snapshot/restore. Used to detect backward jumps when a +/// snapshot is restored into a new partition whose monotonic clock +/// starts from a lower value. +static RAW_HIGH_WATER: AtomicU64 = AtomicU64::new(0); + +/// Cumulative offset added to raw pvclock reads to maintain the +/// monotonic guarantee across cross-partition restores. On each +/// backward jump, the previous high-water mark is added so that all +/// future returns are >= any previously returned value. +static MONO_OFFSET: AtomicU64 = AtomicU64::new(0); + +/// Read the raw monotonic value from the hypervisor without any +/// offset adjustment. +fn raw_monotonic_ns() -> Option { + match read_clock_type() { + ClockType::KvmPvclock => read_kvm_pvclock(), + ClockType::HyperVReferenceTsc => read_hv_reference_tsc(), + ClockType::None => None, + } +} + /// Monotonic time in nanoseconds. /// -/// The value is an absolute counter from the hypervisor's time base -/// (kvmclock on KVM, partition reference time on Hyper-V). 
It is +/// The value is an absolute counter derived from the hypervisor's time +/// base (kvmclock on KVM, partition reference time on Hyper-V). It is /// monotonically increasing and suitable for measuring elapsed time -/// between two reads, but its epoch is unspecified — do not assume -/// it starts at zero when the sandbox is created. +/// between two reads. /// -/// Returns `None` if the clock is not configured, or if the retry cap was -/// exhausted (the caller may retry). +/// If a snapshot is restored into a **new** partition whose raw clock +/// starts from a lower value, an offset is applied so the returned +/// value never goes backward. Within a single partition epoch, diffs +/// between consecutive reads reflect real elapsed time. Across a +/// cross-partition restore the diff includes a synthetic gap (the +/// high-water mark from the old partition) — safe for timeouts and +/// deadlines, but not an accurate measure of freeze duration (use +/// wall-clock time for that). +/// +/// Returns `None` if the clock is not configured, or if the retry cap +/// was exhausted (the caller may retry). pub fn monotonic_time_ns() -> Option { - match read_clock_type() { - ClockType::KvmPvclock => read_kvm_pvclock(), - ClockType::HyperVReferenceTsc => read_hv_reference_tsc(), - ClockType::None => None, + let raw = raw_monotonic_ns()?; + + let high = RAW_HIGH_WATER.load(Ordering::Relaxed); + if raw < high { + // Raw clock went backward — snapshot was restored into a new + // partition. Bump the offset by the old high-water mark so all + // future reads are >= any previously returned value. + MONO_OFFSET.fetch_add(high, Ordering::Relaxed); + RAW_HIGH_WATER.store(raw, Ordering::Relaxed); + } else { + RAW_HIGH_WATER.store(raw, Ordering::Relaxed); } + + Some(raw.wrapping_add(MONO_OFFSET.load(Ordering::Relaxed))) } /// Wall-clock time in nanoseconds since the Unix epoch. @@ -333,7 +372,11 @@ pub fn monotonic_time_ns() -> Option { /// `MSR_KVM_WALL_CLOCK_NEW`). 
We use the same host-computed approach /// on all backends for uniformity. pub fn wall_clock_time_ns() -> Option { - let monotonic = monotonic_time_ns()?; + // Use the raw monotonic value (no cross-partition offset) because + // boot_time_ns is calibrated by the host against the raw clock. + // Applying the monotonic offset here would shift wall time into + // the future after a cross-partition restore. + let monotonic = raw_monotonic_ns()?; let boot_time = read_boot_time_ns(); // boot_time_ns == 0 means the host hasn't stamped it yet // (scratch memory is zero-initialised). Return None rather