Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/kernel_ci_cloud_labs/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,8 @@ def _warn_if_logs_not_public(provider, storage):
logger.warning(
"Bucket %s is missing the PublicReadKernelBootLogs policy; "
"boot-log URLs published to KCIDB will return AccessDenied. "
"Run `aws setup validate --bucket %s --fix` to repair.",
"Run `kernel-ci-cloud-runner aws setup validate --bucket %s --fix` "
"to repair.",
bucket,
bucket,
)
Expand All @@ -339,8 +340,9 @@ def run_pipeline(

# Lightweight check: are kernel boot logs going to be reachable by KCIDB
# dashboard users via the public S3 URL we'll publish? We only warn — a
# broken policy doesn't justify aborting a run, and `aws setup validate
# --fix` is the supported way to repair it.
# broken policy doesn't justify aborting a run, and
# `kernel-ci-cloud-runner aws setup validate --fix` is the supported way
# to repair it.
_warn_if_logs_not_public(provider, storage)

try:
Expand Down
137 changes: 122 additions & 15 deletions src/kernel_ci_cloud_labs/launch_vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,35 @@ def log_not(msg): # pylint: disable=unused-argument
# Info messages only appear in VM CloudWatch logs, not container log


# Kernel-side fatal/near-fatal markers we scan captured console buffers for.
# Hit on any of these gets logged loudly and stamped into the S3 object
# metadata so downstream consumers (KCIDB submitter, triage tooling) can
# flag the run without re-parsing the log. Patterns are case-sensitive
# substrings — kernel prints these verbatim.
PANIC_PATTERNS = (
"Kernel panic - not syncing",
"Oops: ",
"BUG: ",
"Call Trace:",
"general protection fault",
"Unable to handle kernel",
"double fault",
"watchdog: BUG: soft lockup",
"rcu_sched detected stalls",
"page fault in interrupt",
)


def _scan_for_panic(text):
"""Return the first PANIC_PATTERNS substring found in text, or None."""
if not text:
return None
for pattern in PANIC_PATTERNS:
if pattern in text:
return pattern
return None


class VMLauncher:
"""Launch and manage EC2 instances with multi-run test support."""

Expand Down Expand Up @@ -365,16 +394,23 @@ def check_test_result(self):
def capture_console_output(self, reason="cleanup"):
"""Fetch EC2 serial console output (kernel boot log) and upload to S3.

Safe to call multiple times: subsequent calls will overwrite the S3
object with a (typically larger) snapshot. Once we have successfully
captured a non-empty buffer, later calls are skipped — except when
forced via a different `reason` from the SSM-failure path, where the
most recent state of the buffer is more interesting than an earlier
capture taken before the failure was visible.
Re-entrancy rules:
* "cleanup" is the best-effort pre-terminate pass. It's skipped if
a previous call already uploaded — there's nothing new to fetch
while the VM is still running and the buffer only grows.
* "ssm-failure" and "post-terminate" always run. The first catches
a panic visible mid-run; the second catches the flushed final
buffer that EC2 only finalizes after shutdown — typically the
only place an early-boot panic actually shows up. Both can
overwrite an earlier (smaller) capture.

Also scans the scrubbed output for PANIC_PATTERNS and stamps the
result into the S3 object metadata so triage tooling can flag a run
without re-reading the log.

Args:
reason: Free-text label for the call site (logged for diagnostics).
Currently used values: "cleanup", "ssm-failure".
Currently used values: "cleanup", "ssm-failure", "post-terminate".
"""
if not self.instance_id:
return
Expand Down Expand Up @@ -411,6 +447,15 @@ def capture_console_output(self, reason="cleanup"):
log_not(f" Console scrub redacted: {summary}")
output = scrubbed

# Panic scan runs on the scrubbed buffer so the pattern we log can't
# re-leak a token the scrubber just redacted.
panic_match = _scan_for_panic(output)
if panic_match:
log_error(
f"⚠ Kernel panic indicator in console buffer "
f"(instance={self.instance_id}, marker={panic_match!r}, reason={reason})"
)

s3_key = f"{self.run_prefix}/test_{self.test}/output/{self.instance_id}/console-output.log"
try:
self.s3.put_object(
Expand All @@ -423,24 +468,86 @@ def capture_console_output(self, reason="cleanup"):
# Records that the buffer passed through the scrubber, so an
# operator inspecting the object knows it's not raw.
"scrubbed": "v1",
"panic-detected": "true" if panic_match else "false",
},
)
log_not(f"✓ Console output uploaded ({len(output)} bytes) to s3://{self.s3_bucket}/{s3_key}")
self._console_captured = True
except Exception as e:
log_not(f" Failed to upload console output: {e}")

def cleanup(self):
"""Capture console output, then terminate instance."""
self.capture_console_output(reason="cleanup")
def _wait_for_terminated(self, timeout=90):
"""Poll describe_instances until the VM reaches a terminal state.

if self.instance_id:
log_not(f"\n=== Terminating instance {self.instance_id} ===")
EC2 only finalizes the serial-console buffer at shutdown; a
get_console_output call against a still-running short-lived VM
often returns empty because the async mirror hasn't caught up. By
waiting for ``terminated``/``stopped`` we can re-fetch and get the
buffer that includes early-boot output and any panic on shutdown.

Bounded by ``timeout`` so a stuck instance can't pin the pipeline.
Returns True if a terminal state was observed, False on timeout or
API error (caller proceeds either way).
"""
if not self.instance_id:
return False
log_not(f"Waiting up to {timeout}s for instance {self.instance_id} to reach terminated/stopped state...")
start = time.time()
while time.time() - start < timeout:
try:
self.ec2.terminate_instances(InstanceIds=[self.instance_id])
log_not("✓ Instance terminated")
resp = self.ec2.describe_instances(InstanceIds=[self.instance_id])
reservations = resp.get("Reservations", [])
if not reservations or not reservations[0].get("Instances"):
# Instance metadata aged out — treat as terminal.
return True
state = reservations[0]["Instances"][0]["State"]["Name"]
if state in ("terminated", "stopped"):
log_not(f"✓ Instance state: {state} (after {int(time.time() - start)}s)")
return True
except Exception as e:
log_not(f"Error terminating instance: {e}")
log_not(f" describe_instances error: {e} (continuing without wait)")
return False
time.sleep(5)
log_not(f"⚠ Timed out waiting for instance to terminate after {timeout}s")
return False

def cleanup(self):
"""Hybrid console capture around instance termination.

Two captures bracket the terminate call:

1. Pre-terminate (``reason="cleanup"``) — best-effort while the VM
is still alive. Often empty for short-lived VMs (EC2's mirror
lags by minutes), but on a longer run this is the only way to
grab the buffer if termination later hangs.
2. Post-terminate (``reason="post-terminate"``) — after shutdown
EC2 finalizes and preserves the buffer for ~1h. This is the
pass that reliably catches the boot log, kernel panics, and
any shutdown-time oops.

The capture path scans for panic markers and stamps metadata on
whichever upload wins (post-terminate overwrites cleanup).
"""
# Best-effort live capture. Skipped silently inside the helper if a
# prior ssm-failure capture already uploaded.
self.capture_console_output(reason="cleanup")

if not self.instance_id:
return

log_not(f"\n=== Terminating instance {self.instance_id} ===")
try:
self.ec2.terminate_instances(InstanceIds=[self.instance_id])
log_not("✓ Termination requested")
except Exception as e:
log_not(f"Error terminating instance: {e}")
return

# Wait for the VM to actually wind down, then grab the flushed
# buffer. This is where an early-boot panic that never made it into
# the live mirror finally becomes visible.
self._wait_for_terminated(timeout=90)
self.capture_console_output(reason="post-terminate")


def launch_vms_from_config():
Expand Down
Loading