diff --git a/src/hyperlight_guest/src/exit.rs b/src/hyperlight_guest/src/exit.rs index 449d1ddc2..40d72e6d2 100644 --- a/src/hyperlight_guest/src/exit.rs +++ b/src/hyperlight_guest/src/exit.rs @@ -21,6 +21,9 @@ use hyperlight_common::outb::OutBAction; use tracing::instrument; /// Halt the execution of the guest and returns control to the host. +/// Halt is generally called for a successful completion of the guest's work, +/// this means we can instrument it as a trace point because the trace state +/// shall not be locked at this point (we are not in an exception context). #[inline(never)] #[instrument(skip_all, level = "Trace")] pub fn halt() { @@ -122,7 +125,12 @@ pub(crate) fn outb(port: u16, data: &[u8]) { } /// OUT function for sending a 32-bit value to the host. -#[instrument(skip_all, level = "Trace")] +/// `out32` can be called from an exception context, so we must be careful +/// with the tracing state that might be locked at that time. +/// The tracing state calls `try_lock` internally to avoid deadlocks. +/// Furthermore, the instrument macro is not used here to avoid creating spans +/// in exception contexts. Because if the trace state is already locked, trying to create a span +/// would cause a panic, which is undesirable in exception handling. pub(crate) unsafe fn out32(port: u16, val: u32) { #[cfg(feature = "trace_guest")] { diff --git a/src/hyperlight_guest_tracing/src/lib.rs b/src/hyperlight_guest_tracing/src/lib.rs index eda7aada5..230ce2d4d 100644 --- a/src/hyperlight_guest_tracing/src/lib.rs +++ b/src/hyperlight_guest_tracing/src/lib.rs @@ -55,6 +55,11 @@ mod trace { use crate::subscriber::GuestSubscriber; /// Weak reference to the guest state so we can manually trigger flush to host + /// The `GuestState` is ONLY accessed from two places: + /// - The tracing subscriber, when spans/events are created in the guest + /// - The guest tracing API, when we want manual control to flush the events to the host + /// + /// The mutex ensures safe access to the state from both places. static GUEST_STATE: spin::Once>> = spin::Once::new(); /// Initialize the guest tracing subscriber as global default. @@ -72,12 +77,27 @@ mod trace { let _ = tracing_core::dispatcher::set_global_default(tracing_core::Dispatch::new(sub)); } - /// Sets the guset starting timestamp reported to the host on a VMExit + /// Sets the guest starting timestamp reported to the host on a VMExit + /// NOTE: Panics if unable to lock the guest state. pub fn set_start_tsc(guest_start_tsc: u64) { if let Some(w) = GUEST_STATE.get() - && let Some(state) = w.upgrade() + && let Some(state_mutex) = w.upgrade() { - state.lock().set_start_tsc(guest_start_tsc); + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = state_mutex + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `set_start_tsc`"); + state.set_start_tsc(guest_start_tsc); } } @@ -87,32 +107,78 @@ mod trace { /// This expects an outb call to send the spans to the host. /// After calling this function, the internal state is marked /// for cleaning on the next access. + /// + /// NOTE: Panics if unable to lock the guest state. pub fn end_trace() { if let Some(w) = GUEST_STATE.get() - && let Some(state) = w.upgrade() + && let Some(state_mutex) = w.upgrade() { - state.lock().end_trace(); + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = state_mutex + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `end_trace`"); + state.end_trace(); } } /// Cleans the internal trace state by removing closed spans and events. /// This ensures that after a VM exit, we keep the spans that /// are still active (in the stack) and remove all other spans and events. + /// NOTE: Panics if unable to lock the guest state. pub fn clean_trace_state() { if let Some(w) = GUEST_STATE.get() - && let Some(state) = w.upgrade() + && let Some(state_mutex) = w.upgrade() { - state.lock().clean(); + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = state_mutex + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `clean_trace_state`"); + state.clean(); } } /// Returns information about the current trace state needed by the host to read the spans. + /// NOTE: Panics if unable to lock the guest state. pub fn guest_trace_info() -> Option { let mut res = None; if let Some(w) = GUEST_STATE.get() - && let Some(state) = w.upgrade() + && let Some(state_mutex) = w.upgrade() { - res = Some(state.lock().guest_trace_info()); + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = state_mutex + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `guest_trace_info`"); + res = Some(state.guest_trace_info()); } res } diff --git a/src/hyperlight_guest_tracing/src/subscriber.rs b/src/hyperlight_guest_tracing/src/subscriber.rs index fca202bb0..e36f71c59 100644 --- a/src/hyperlight_guest_tracing/src/subscriber.rs +++ b/src/hyperlight_guest_tracing/src/subscriber.rs @@ -29,6 +29,7 @@ pub(crate) struct GuestSubscriber { /// Internal state that holds the spans and events /// Protected by a Mutex for inner mutability /// A reference to this state is stored in a static variable + /// so it can be accessed from the guest tracing API state: Arc>, } @@ -49,27 +50,123 @@ impl Subscriber for GuestSubscriber { } fn new_span(&self, attrs: &Attributes<'_>) -> Id { - self.state.lock().new_span(attrs) + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = self + .state + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `new_span`"); + + state.new_span(attrs) } fn record(&self, id: &Id, values: &Record<'_>) { - self.state.lock().record(id, values) + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = self + .state + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `record`"); + + state.record(id, values) } fn event(&self, event: &Event<'_>) { - self.state.lock().event(event) + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = self + .state + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `event`"); + + state.event(event) } fn enter(&self, id: &Id) { - self.state.lock().enter(id) + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = self + .state + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `enter`"); + + state.enter(id) } fn exit(&self, id: &Id) { - self.state.lock().exit(id) + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = self + .state + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `exit`"); + + state.exit(id) } fn try_close(&self, id: Id) -> bool { - self.state.lock().try_close(id) + // We want to protect against re-entrancy issues produced by tracing code that locks + // the state and then causes an exception that tries to lock the state again. + // + // For example: + // - 1. A span is created, locking the state + // - 2. An exception occurs while the span is being created (e.g. not enough memory, etc.) + // - 3. The exception handler uses the tracing API to send the trace data to the host + // or just create spans/events for logging purposes. + // - 4. The tracing API tries to lock the state again, causing a deadlock. + // To avoid this, we use try_lock and if we cannot acquire the lock, we panic to signal + // the issue. + let mut state = self + .state + .try_lock() + .expect("guest_tracing: Unable to lock guest tracing state in `try_close`"); + + state.try_close(id) } fn record_follows_from(&self, _span: &Id, _follows: &Id) { diff --git a/src/hyperlight_guest_tracing/src/visitor.rs b/src/hyperlight_guest_tracing/src/visitor.rs index df0baea3b..05ac25fbf 100644 --- a/src/hyperlight_guest_tracing/src/visitor.rs +++ b/src/hyperlight_guest_tracing/src/visitor.rs @@ -34,8 +34,7 @@ impl<'a> Visit for FieldsVisitor<'a> { /// * `value` - The byte slice value fn record_bytes(&mut self, f: &Field, v: &[u8]) { let k = String::from(f.name()); - let mut val = String::new(); - val.push_str(&alloc::format!("{v:?}")); + let val = alloc::format!("{v:?}"); self.out.push(KeyValue { key: k, value: val }); } @@ -45,8 +44,7 @@ impl<'a> Visit for FieldsVisitor<'a> { /// * `v` - The string value fn record_str(&mut self, f: &Field, v: &str) { let k = String::from(f.name()); - let mut val = String::new(); - val.push_str(v); + let val = String::from(v); self.out.push(KeyValue { key: k, value: val }); } @@ -56,8 +54,7 @@ impl<'a> Visit for FieldsVisitor<'a> { /// * `v` - The debug value fn record_debug(&mut self, f: &Field, v: &dyn Debug) { let k = String::from(f.name()); - let mut val = String::new(); - val.push_str(&alloc::format!("{v:?}")); + let val = alloc::format!("{v:?}"); self.out.push(KeyValue { key: k, value: val }); } } diff --git a/src/tests/rust_guests/dummyguest/Cargo.lock b/src/tests/rust_guests/dummyguest/Cargo.lock index 2109679a7..725c332ce 100644 --- a/src/tests/rust_guests/dummyguest/Cargo.lock +++ b/src/tests/rust_guests/dummyguest/Cargo.lock @@ -135,9 +135,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.28" +version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "memchr" @@ -265,9 +265,9 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.41" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +checksum = "2d15d90a0b5c19378952d479dc858407149d7bb45a14de0142f6c534b16fc647" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -276,9 +276,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", diff --git a/src/tests/rust_guests/simpleguest/Cargo.lock b/src/tests/rust_guests/simpleguest/Cargo.lock index 8d6d915b5..1b9611f7b 100644 --- a/src/tests/rust_guests/simpleguest/Cargo.lock +++ b/src/tests/rust_guests/simpleguest/Cargo.lock @@ -127,9 +127,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.28" +version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "memchr" @@ -269,9 +269,9 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.41" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +checksum = "2d15d90a0b5c19378952d479dc858407149d7bb45a14de0142f6c534b16fc647" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", diff --git a/src/tests/rust_guests/witguest/Cargo.lock b/src/tests/rust_guests/witguest/Cargo.lock index dba84d7a3..bad77a91b 100644 --- a/src/tests/rust_guests/witguest/Cargo.lock +++ b/src/tests/rust_guests/witguest/Cargo.lock @@ -320,9 +320,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.28" +version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "memchr" @@ -510,9 +510,9 @@ dependencies = [ [[package]] name = "tracing" -version = "0.1.41" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +checksum = "2d15d90a0b5c19378952d479dc858407149d7bb45a14de0142f6c534b16fc647" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -521,9 +521,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote",