Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 99 additions & 20 deletions src/kernel_ci_cloud_labs/launch_vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@

import base64
import json
import os
import shlex
import sys
import threading
import time
import traceback
import uuid

import boto3
Expand All @@ -18,6 +21,12 @@
from kernel_ci_cloud_labs.core.log_scrub import scrub_text


# Enable verbose diagnostics by setting KCI_DEBUG=1 (or true/yes/on) on the
# ECS task. log_debug() lines land in the container log group alongside
# INFO/ERROR. Defaults to off so production logs stay quiet.
_DEBUG_ENABLED = os.environ.get("KCI_DEBUG", "").lower() in ("1", "true", "yes", "on")


def log_error(msg):
"""Print error to stderr (shows in container log)."""
sys.stderr.write(f"ERROR: {msg}\n")
Expand All @@ -43,6 +52,30 @@ def log_not(msg):
sys.stdout.flush()


def log_debug(msg):
"""Print debug diagnostic only when KCI_DEBUG is set in the environment.

Used to dump intermediate state (response shapes, retry timing, buffer
lengths) without polluting normal production runs.
"""
if _DEBUG_ENABLED:
sys.stdout.write(f"DEBUG: {msg}\n")
sys.stdout.flush()


def log_exception(prefix, exc):
"""Log an exception with full traceback to stderr.

Use on any catch site where a bare ``log_error(str(e))`` would hide
the call stack — boto3 errors in particular need the traceback to
distinguish a NoCredentialsError from a ClientError from a transient
network blip.
"""
sys.stderr.write(f"ERROR: {prefix}: {exc!r}\n")
sys.stderr.write(traceback.format_exc())
sys.stderr.flush()


# Kernel-side fatal/near-fatal markers we scan captured console buffers for.
# Hit on any of these gets logged loudly and stamped into the S3 object
# metadata so downstream consumers (KCIDB submitter, triage tooling) can
Expand Down Expand Up @@ -437,24 +470,38 @@ def capture_console_output(self, reason="cleanup"):
# because post-terminate covers the lag.
output_b64 = ""
if reason == "post-terminate":
poll_budget = 240 # 4 min; EC2 mirror typically settles in 1–3 min
# AWS documents the serial-console mirror update cadence as
# "approximately every 5 minutes" — arm64/Graviton tends to be
# at the slow end. 9 min gives the mirror two full cycles to
# show up; 15 s polling keeps API cost bounded.
poll_budget = 540
poll_interval = 15
start = time.time()
attempt = 0
last_exc_repr = None
while time.time() - start < poll_budget:
attempt += 1
try:
resp = self.ec2.get_console_output(
InstanceId=self.instance_id, Latest=True
)
output_b64 = resp.get("Output", "")
log_debug(
f"get_console_output attempt={attempt} "
f"output_len_b64={len(output_b64)} "
f"keys={sorted(resp.keys())}"
)
except Exception as e:
log_not(f" get_console_output error (attempt {attempt}): {e}")
last_exc_repr = repr(e)
log_exception(
f"get_console_output failed (attempt {attempt})", e
)
output_b64 = ""
if output_b64:
log_not(
f" Console buffer available on attempt {attempt} "
f"(after {int(time.time() - start)}s)"
f"(after {int(time.time() - start)}s, "
f"b64_len={len(output_b64)})"
)
break
log_not(
Expand All @@ -463,20 +510,27 @@ def capture_console_output(self, reason="cleanup"):
)
time.sleep(poll_interval)
if not output_b64:
log_not(
f" No console output after {attempt} attempts "
f"({int(time.time() - start)}s) — EC2 mirror never flushed"
log_error(
f"No console output after {attempt} attempts "
f"({int(time.time() - start)}s, budget {poll_budget}s) — "
f"EC2 mirror never flushed for instance {self.instance_id}"
+ (f"; last exception: {last_exc_repr}" if last_exc_repr else "")
)
return
else:
try:
resp = self.ec2.get_console_output(
InstanceId=self.instance_id, Latest=True
)
output_b64 = resp.get("Output", "")
log_debug(
f"get_console_output (reason={reason}) "
f"output_len_b64={len(output_b64)} "
f"keys={sorted(resp.keys())}"
)
except Exception as e:
log_not(f" Failed to fetch console output: {e}")
log_exception(f"get_console_output failed (reason={reason})", e)
return
output_b64 = resp.get("Output", "")
if not output_b64:
log_not(" No console output available yet")
return
Expand Down Expand Up @@ -523,7 +577,10 @@ def capture_console_output(self, reason="cleanup"):
log_not(f"✓ Console output uploaded ({len(output)} bytes) to s3://{self.s3_bucket}/{s3_key}")
self._console_captured = True
except Exception as e:
log_not(f" Failed to upload console output: {e}")
log_exception(
f"Failed to upload console output to s3://{self.s3_bucket}/{s3_key}",
e,
)

def _wait_for_terminated(self, timeout=90):
"""Poll describe_instances until the VM reaches a terminal state.
Expand Down Expand Up @@ -577,9 +634,14 @@ def cleanup(self):
The capture path scans for panic markers and stamps metadata on
whichever upload wins (post-terminate overwrites cleanup).
"""
# Best-effort live capture. Skipped silently inside the helper if a
# prior ssm-failure capture already uploaded.
self.capture_console_output(reason="cleanup")
# Each stage is wrapped so one failure doesn't silently abort the
# remaining cleanup work. Previously a boto3 error in terminate or
# an unhandled exception in capture would short-circuit cleanup and
# leave no traceback in the container log.
try:
self.capture_console_output(reason="cleanup")
except Exception as e:
log_exception("capture_console_output(cleanup) raised", e)

if not self.instance_id:
return
Expand All @@ -589,20 +651,30 @@ def cleanup(self):
self.ec2.terminate_instances(InstanceIds=[self.instance_id])
log_not("✓ Termination requested")
except Exception as e:
log_not(f"Error terminating instance: {e}")
return
log_exception(
f"terminate_instances failed for {self.instance_id}", e
)
# Still attempt post-terminate capture: instance may already be
# in a terminal state, in which case GetConsoleOutput still
# works against the preserved buffer.

# Wait for the VM to actually wind down, then grab the flushed
# buffer. This is where an early-boot panic that never made it into
# the live mirror finally becomes visible.
self._wait_for_terminated(timeout=90)
self.capture_console_output(reason="post-terminate")
try:
self._wait_for_terminated(timeout=90)
except Exception as e:
log_exception("_wait_for_terminated raised", e)

try:
self.capture_console_output(reason="post-terminate")
except Exception as e:
log_exception("capture_console_output(post-terminate) raised", e)


def launch_vms_from_config():
"""Launch and test multiple VMs from environment variables (no config file needed)."""
import os
import threading
log_info(f"launch_vm startup: KCI_DEBUG={'on' if _DEBUG_ENABLED else 'off'}")

# Read config from environment variables
run_prefix = os.getenv("RUN_PREFIX")
Expand Down Expand Up @@ -730,10 +802,17 @@ def launch_and_test_vm(vm_config, instance_num, results_list):
results_list.append({"vm_name": vm_name, "success": False})

except Exception as e:
log_error(f"FAILED: {vm_name} - {e}")
log_exception(f"FAILED: {vm_name}", e)
results_list.append({"vm_name": vm_name, "success": False})
finally:
launcher.cleanup()
# cleanup() is already internally guarded per-stage, but a thread
# that escapes its target with an unhandled exception is silent
# by default — wrap once more so any surviving error reaches
# the container log with a traceback.
try:
launcher.cleanup()
except Exception as e:
log_exception(f"cleanup raised for {vm_name}", e)

# Launch all VMs in parallel using threads
threads = []
Expand Down
12 changes: 9 additions & 3 deletions src/kernel_ci_cloud_labs/providers/aws_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@


# providers/aws_provider.py
import json
import os
import re
import time
Expand Down Expand Up @@ -97,9 +98,6 @@ def spawn_container(self):
logger.debug("Network config: %s", network_config)

# Build environment overrides with runtime config
import json
import time

env_vars = []

# Pass run prefix
Expand All @@ -124,6 +122,14 @@ def spawn_container(self):
env_vars.append({"name": "EC2_LOG_GROUP", "value": ec2_log_group})
logger.debug("Passing EC2_LOG_GROUP to container: %s", ec2_log_group)

# Forward KCI_DEBUG to the container so launch_vm.log_debug() lines
# show up in the ECS task log group. Set on the host running the
# poller: KCI_DEBUG=1 ./prod-amd64.sh
kci_debug = os.environ.get("KCI_DEBUG")
if kci_debug:
env_vars.append({"name": "KCI_DEBUG", "value": kci_debug})
logger.debug("Passing KCI_DEBUG to container: %s", kci_debug)

# Always pass test config via S3 in results bucket
if self.config.get("test_config"):
test_config_json = json.dumps(self.config["test_config"])
Expand Down
Loading