From ae3b2ffd7f08eeab49365c9471e62a849fdc6078 Mon Sep 17 00:00:00 2001 From: Denys Fedoryshchenko Date: Sat, 23 May 2026 06:30:28 +0300 Subject: [PATCH 1/2] Fix misleading s3 fix command name Signed-off-by: Denys Fedoryshchenko --- src/kernel_ci_cloud_labs/core/pipeline.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/kernel_ci_cloud_labs/core/pipeline.py b/src/kernel_ci_cloud_labs/core/pipeline.py index 23a9ecb..efcba23 100644 --- a/src/kernel_ci_cloud_labs/core/pipeline.py +++ b/src/kernel_ci_cloud_labs/core/pipeline.py @@ -313,7 +313,8 @@ def _warn_if_logs_not_public(provider, storage): logger.warning( "Bucket %s is missing the PublicReadKernelBootLogs policy; " "boot-log URLs published to KCIDB will return AccessDenied. " - "Run `aws setup validate --bucket %s --fix` to repair.", + "Run `kernel-ci-cloud-runner aws setup validate --bucket %s --fix` " + "to repair.", bucket, bucket, ) @@ -339,8 +340,9 @@ def run_pipeline( # Lightweight check: are kernel boot logs going to be reachable by KCIDB # dashboard users via the public S3 URL we'll publish? We only warn — a - # broken policy doesn't justify aborting a run, and `aws setup validate - # --fix` is the supported way to repair it. + # broken policy doesn't justify aborting a run, and + # `kernel-ci-cloud-runner aws setup validate --fix` is the supported way + # to repair it. _warn_if_logs_not_public(provider, storage) try: From aad2f23b089ac99448afd5853a5593f878ebc335 Mon Sep 17 00:00:00 2001 From: Denys Fedoryshchenko Date: Sat, 23 May 2026 06:39:41 +0300 Subject: [PATCH 2/2] Wait for logs after vm termination Signed-off-by: Denys Fedoryshchenko --- src/kernel_ci_cloud_labs/launch_vm.py | 137 +++++++++++++++++++++++--- 1 file changed, 122 insertions(+), 15 deletions(-) diff --git a/src/kernel_ci_cloud_labs/launch_vm.py b/src/kernel_ci_cloud_labs/launch_vm.py index 569ad68..eda556a 100644 --- a/src/kernel_ci_cloud_labs/launch_vm.py +++ b/src/kernel_ci_cloud_labs/launch_vm.py @@ -35,6 +35,35 @@ def log_not(msg): # pylint: disable=unused-argument # Info messages only appear in VM CloudWatch logs, not container log +# Kernel-side fatal/near-fatal markers we scan captured console buffers for. +# Hit on any of these gets logged loudly and stamped into the S3 object +# metadata so downstream consumers (KCIDB submitter, triage tooling) can +# flag the run without re-parsing the log. Patterns are case-sensitive +# substrings — kernel prints these verbatim. +PANIC_PATTERNS = ( + "Kernel panic - not syncing", + "Oops: ", + "BUG: ", + "Call Trace:", + "general protection fault", + "Unable to handle kernel", + "double fault", + "watchdog: BUG: soft lockup", + "rcu_sched detected stalls", + "page fault in interrupt", +) + + +def _scan_for_panic(text): + """Return the first PANIC_PATTERNS substring found in text, or None.""" + if not text: + return None + for pattern in PANIC_PATTERNS: + if pattern in text: + return pattern + return None + + class VMLauncher: """Launch and manage EC2 instances with multi-run test support.""" @@ -365,16 +394,23 @@ def check_test_result(self): def capture_console_output(self, reason="cleanup"): """Fetch EC2 serial console output (kernel boot log) and upload to S3. - Safe to call multiple times: subsequent calls will overwrite the S3 - object with a (typically larger) snapshot. Once we have successfully - captured a non-empty buffer, later calls are skipped — except when - forced via a different `reason` from the SSM-failure path, where the - most recent state of the buffer is more interesting than an earlier - capture taken before the failure was visible. + Re-entrancy rules: + * "cleanup" is the best-effort pre-terminate pass. It's skipped if + a previous call already uploaded — there's nothing new to fetch + while the VM is still running and the buffer only grows. + * "ssm-failure" and "post-terminate" always run. The first catches + a panic visible mid-run; the second catches the flushed final + buffer that EC2 only finalizes after shutdown — typically the + only place an early-boot panic actually shows up. Both can + overwrite an earlier (smaller) capture. + + Also scans the scrubbed output for PANIC_PATTERNS and stamps the + result into the S3 object metadata so triage tooling can flag a run + without re-reading the log. Args: reason: Free-text label for the call site (logged for diagnostics). - Currently used values: "cleanup", "ssm-failure". + Currently used values: "cleanup", "ssm-failure", "post-terminate". """ if not self.instance_id: return @@ -411,6 +447,15 @@ def capture_console_output(self, reason="cleanup"): log_not(f" Console scrub redacted: {summary}") output = scrubbed + # Panic scan runs on the scrubbed buffer so the pattern we log can't + # re-leak a token the scrubber just redacted. + panic_match = _scan_for_panic(output) + if panic_match: + log_error( + f"⚠ Kernel panic indicator in console buffer " + f"(instance={self.instance_id}, marker={panic_match!r}, reason={reason})" + ) + s3_key = f"{self.run_prefix}/test_{self.test}/output/{self.instance_id}/console-output.log" try: self.s3.put_object( @@ -423,6 +468,7 @@ def capture_console_output(self, reason="cleanup"): # Records that the buffer passed through the scrubber, so an # operator inspecting the object knows it's not raw. "scrubbed": "v1", + "panic-detected": "true" if panic_match else "false", }, ) log_not(f"✓ Console output uploaded ({len(output)} bytes) to s3://{self.s3_bucket}/{s3_key}") @@ -430,17 +476,78 @@ def capture_console_output(self, reason="cleanup"): except Exception as e: log_not(f" Failed to upload console output: {e}") - def cleanup(self): - """Capture console output, then terminate instance.""" - self.capture_console_output(reason="cleanup") + def _wait_for_terminated(self, timeout=90): + """Poll describe_instances until the VM reaches a terminal state. - if self.instance_id: - log_not(f"\n=== Terminating instance {self.instance_id} ===") + EC2 only finalizes the serial-console buffer at shutdown; a + get_console_output call against a still-running short-lived VM + often returns empty because the async mirror hasn't caught up. By + waiting for ``terminated``/``stopped`` we can re-fetch and get the + buffer that includes early-boot output and any panic on shutdown. + + Bounded by ``timeout`` so a stuck instance can't pin the pipeline. + Returns True if a terminal state was observed, False on timeout or + API error (caller proceeds either way). + """ + if not self.instance_id: + return False + log_not(f"Waiting up to {timeout}s for instance {self.instance_id} to reach terminated/stopped state...") + start = time.time() + while time.time() - start < timeout: try: - self.ec2.terminate_instances(InstanceIds=[self.instance_id]) - log_not("✓ Instance terminated") + resp = self.ec2.describe_instances(InstanceIds=[self.instance_id]) + reservations = resp.get("Reservations", []) + if not reservations or not reservations[0].get("Instances"): + # Instance metadata aged out — treat as terminal. + return True + state = reservations[0]["Instances"][0]["State"]["Name"] + if state in ("terminated", "stopped"): + log_not(f"✓ Instance state: {state} (after {int(time.time() - start)}s)") + return True except Exception as e: - log_not(f"Error terminating instance: {e}") + log_not(f" describe_instances error: {e} (continuing without wait)") + return False + time.sleep(5) + log_not(f"⚠ Timed out waiting for instance to terminate after {timeout}s") + return False + + def cleanup(self): + """Hybrid console capture around instance termination. + + Two captures bracket the terminate call: + + 1. Pre-terminate (``reason="cleanup"``) — best-effort while the VM + is still alive. Often empty for short-lived VMs (EC2's mirror + lags by minutes), but on a longer run this is the only way to + grab the buffer if termination later hangs. + 2. Post-terminate (``reason="post-terminate"``) — after shutdown + EC2 finalizes and preserves the buffer for ~1h. This is the + pass that reliably catches the boot log, kernel panics, and + any shutdown-time oops. + + The capture path scans for panic markers and stamps metadata on + whichever upload wins (post-terminate overwrites cleanup). + """ + # Best-effort live capture. Skipped silently inside the helper if a + # prior ssm-failure capture already uploaded. + self.capture_console_output(reason="cleanup") + + if not self.instance_id: + return + + log_not(f"\n=== Terminating instance {self.instance_id} ===") + try: + self.ec2.terminate_instances(InstanceIds=[self.instance_id]) + log_not("✓ Termination requested") + except Exception as e: + log_not(f"Error terminating instance: {e}") + return + + # Wait for the VM to actually wind down, then grab the flushed + # buffer. This is where an early-boot panic that never made it into + # the live mirror finally becomes visible. + self._wait_for_terminated(timeout=90) + self.capture_console_output(reason="post-terminate") def launch_vms_from_config():