Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions integration/systemd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ package integration
import (
"bytes"
"context"
"io"
"os"
"strings"
"syscall"
"testing"
"time"

Expand Down Expand Up @@ -162,6 +164,66 @@ func TestSystemdMode(t *testing.T) {
t.Logf("Agent logs (last 5 lines):\n%s", output)
})

// Regression: serial log must survive copytruncate without leaving a
// sparse NUL hole. Pre-fix, Cloud Hypervisor held a non-O_APPEND fd on
// app.log, so post-truncate writes landed at the stale offset and the
// file became multi-GB sparse with NUL bytes from byte 0. After the
// fix, hypeman owns the writer fd with O_APPEND.
t.Run("SerialLogSurvivesCopytruncate", func(t *testing.T) {
if inst.HypervisorType != hypervisor.TypeCloudHypervisor {
t.Skipf("regression test is CH-specific; instance uses %s", inst.HypervisorType)
}
appLog := p.InstanceAppLog(inst.Id)

require.Eventually(t, func() bool {
st, err := os.Stat(appLog)
return err == nil && st.Size() > 1024
}, 30*time.Second, 200*time.Millisecond, "expected serial output to accumulate before rotation")

src, err := os.Open(appLog)
require.NoError(t, err)
dst, err := os.Create(appLog + ".1")
require.NoError(t, err)
_, err = io.Copy(dst, src)
_ = src.Close()
_ = dst.Close()
require.NoError(t, err)
require.NoError(t, os.Truncate(appLog, 0))

// Drive more serial output post-truncate.
_, _, err = execInInstance(ctx, inst, "sh", "-c", "for i in 1 2 3; do echo post-rotate-marker-$i > /dev/kmsg; done")
require.NoError(t, err)

require.Eventually(t, func() bool {
st, err := os.Stat(appLog)
return err == nil && st.Size() > 0
}, 10*time.Second, 200*time.Millisecond)

st, err := os.Stat(appLog)
require.NoError(t, err)
sys := st.Sys().(*syscall.Stat_t)
allocated := int64(sys.Blocks) * 512
apparent := st.Size()
// Allow one block of slack since stat blocks granularity is 512B.
assert.LessOrEqualf(t, apparent, allocated+4096,
"post-rotation app.log is sparse: apparent=%d allocated=%d (sparse_bytes=%d)",
apparent, allocated, apparent-allocated)

head := make([]byte, 64)
f, err := os.Open(appLog)
require.NoError(t, err)
n, _ := f.Read(head)
_ = f.Close()
nulCount := 0
for _, b := range head[:n] {
if b == 0 {
nulCount++
}
}
assert.Lessf(t, nulCount, n/2,
"post-rotation app.log starts with too many NULs (%d/%d) — likely a sparse hole", nulCount, n)
})

t.Log("All systemd mode tests passed!")
}

Expand Down
10 changes: 9 additions & 1 deletion lib/hypervisor/cloudhypervisor/cloudhypervisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ import (
type CloudHypervisor struct {
client *vmm.VMM
socketPath string
// serial is set on the Starter path (StartVM/RestoreVM) so Shutdown
// can stop the reader explicitly. When the client is constructed via
// the reconnect factory (New), there is no reader to own — the
// goroutine from the original process exited with that process.
serial *serialReader
}

var balloonTargetCache hypervisor.BalloonTargetCache
Expand Down Expand Up @@ -73,10 +78,13 @@ func (c *CloudHypervisor) DeleteVM(ctx context.Context) error {
// Shutdown stops the VMM process gracefully.
func (c *CloudHypervisor) Shutdown(ctx context.Context) error {
resp, err := c.client.ShutdownVMMWithResponse(ctx)
// Stop the serial reader regardless of API outcome — once Shutdown
// has been requested the VM is going away and the reader has no
// further work.
c.serial.Close()
if err != nil {
return fmt.Errorf("shutdown vmm: %w", err)
}
// ShutdownVMM may return various codes, 204 is success
if resp.StatusCode() != 204 {
return fmt.Errorf("shutdown vmm failed with status %d", resp.StatusCode())
}
Expand Down
28 changes: 25 additions & 3 deletions lib/hypervisor/cloudhypervisor/config.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,30 @@
package cloudhypervisor

import (
"path/filepath"

"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/vmm"
)

// serialSocketPath returns the unix socket path that Cloud Hypervisor
// binds for serial output. The socket lives at the instance directory
// level, next to ch.sock and vsock.sock — not under logs/ — so the
// total path stays under the 108-byte sun_path limit on Linux (104 on
// macOS) when long test temp prefixes are involved.
//
// We route serial through a hypeman-owned socket reader (see serial.go)
// rather than letting CH open the file directly, because CH's File-mode
// serial opens without O_APPEND. Combined with copytruncate-style log
// rotation that leaves CH's fd offset stale, the next write lands past
// EOF and creates a sparse hole of NUL bytes from byte 0 onward.
func serialSocketPath(logPath string) string {
if logPath == "" {
return ""
}
return filepath.Join(filepath.Dir(filepath.Dir(logPath)), "serial.sock")
}

// ToVMConfig converts hypervisor.VMConfig to Cloud Hypervisor's vmm.VmConfig.
func ToVMConfig(cfg hypervisor.VMConfig) vmm.VmConfig {
// Payload configuration (kernel + initramfs)
Expand Down Expand Up @@ -66,10 +86,12 @@ func ToVMConfig(cfg hypervisor.VMConfig) vmm.VmConfig {
disks = append(disks, disk)
}

// Serial console configuration
// Serial console configuration. We route serial through a unix socket
// that hypeman listens on (see startSerialReader) instead of letting
// CH write to the file directly — see serialSocketPath for rationale.
serial := vmm.ConsoleConfig{
Mode: vmm.ConsoleConfigMode("File"),
File: ptr(cfg.SerialLogPath),
Mode: vmm.ConsoleConfigModeSocket,
Socket: ptr(serialSocketPath(cfg.SerialLogPath)),
}

// Console off (we use serial)
Expand Down
32 changes: 31 additions & 1 deletion lib/hypervisor/cloudhypervisor/fork_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,37 @@ func updateSerialConfig(config map[string]any, logPath string) {
if !ok || serial == nil {
return
}
serial["file"] = logPath
// Forks always use the socket-based serial reader (see config.go), so
// rewrite to the new shape regardless of the source snapshot's mode.
// This also migrates legacy File-mode snapshots to Socket on fork.
delete(serial, "file")
serial["mode"] = "Socket"
serial["socket"] = serialSocketPath(logPath)
}

// rewriteSerialConfigForRestore migrates the on-disk snapshot config so
// CH binds the socket-mode reader on restore. Pre-fix snapshots embed
// serial.mode=File, which keeps the original copytruncate sparse-hole
// bug alive after restore. New snapshots are already mode=Socket, so
// this is a cheap no-op for them.
func rewriteSerialConfigForRestore(configPath, logPath string) error {
data, err := os.ReadFile(configPath)
if err != nil {
return fmt.Errorf("read snapshot config: %w", err)
}
var config map[string]any
if err := json.Unmarshal(data, &config); err != nil {
return fmt.Errorf("unmarshal snapshot config: %w", err)
}
updateSerialConfig(config, logPath)
updated, err := json.MarshalIndent(config, "", " ")
if err != nil {
return fmt.Errorf("marshal snapshot config: %w", err)
}
if err := os.WriteFile(configPath, updated, 0644); err != nil {
return fmt.Errorf("write snapshot config: %w", err)
}
return nil
}

func updateNetworkConfig(config map[string]any, netCfg *hypervisor.ForkNetworkConfig) {
Expand Down
66 changes: 64 additions & 2 deletions lib/hypervisor/cloudhypervisor/fork_snapshot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ func TestRewriteSnapshotConfigForFork(t *testing.T) {

orig := map[string]any{
"disks": []any{map[string]any{"path": "/src/guests/a/overlay.raw"}},
"serial": map[string]any{"file": "/src/guests/a/logs/app.log"},
"serial": map[string]any{"mode": "Socket", "socket": "/src/guests/a/serial.sock"},
"vsock": map[string]any{"cid": float64(100), "socket": "/src/guests/a/vsock.sock"},
"metadata": map[string]any{
"note": "keep-/src/guests/a-as-substring",
Expand Down Expand Up @@ -59,7 +59,10 @@ func TestRewriteSnapshotConfigForFork(t *testing.T) {
assert.Equal(t, "/dst/guests/b/overlay.raw", disk0["path"])

serial := updated["serial"].(map[string]any)
assert.Equal(t, "/dst/guests/b/logs/app.log", serial["file"])
assert.Equal(t, "Socket", serial["mode"])
assert.Equal(t, "/dst/guests/b/serial.sock", serial["socket"])
_, hasFile := serial["file"]
assert.False(t, hasFile, "fork rewrite should drop legacy serial.file")

vsock := updated["vsock"].(map[string]any)
assert.Equal(t, float64(100), vsock["cid"])
Expand All @@ -74,3 +77,62 @@ func TestRewriteSnapshotConfigForFork(t *testing.T) {
metadata := updated["metadata"].(map[string]any)
assert.Equal(t, "keep-/src/guests/a-as-substring", metadata["note"])
}

func TestRewriteSerialConfigForRestore(t *testing.T) {
t.Run("FileToSocket", func(t *testing.T) {
path := writeSnapshotConfig(t, map[string]any{
"serial": map[string]any{"mode": "File", "file": "/old/logs/app.log"},
})
require.NoError(t, rewriteSerialConfigForRestore(path, "/inst/logs/app.log"))

serial := readSerialConfig(t, path)
assert.Equal(t, "Socket", serial["mode"])
assert.Equal(t, "/inst/serial.sock", serial["socket"])
_, hasFile := serial["file"]
assert.False(t, hasFile, "legacy file field must be removed")
})

t.Run("AlreadySocketIsIdempotent", func(t *testing.T) {
path := writeSnapshotConfig(t, map[string]any{
"serial": map[string]any{"mode": "Socket", "socket": "/inst/serial.sock"},
})
require.NoError(t, rewriteSerialConfigForRestore(path, "/inst/logs/app.log"))

serial := readSerialConfig(t, path)
assert.Equal(t, "Socket", serial["mode"])
assert.Equal(t, "/inst/serial.sock", serial["socket"])
})

t.Run("NoSerialBlockIsNoOp", func(t *testing.T) {
path := writeSnapshotConfig(t, map[string]any{
"disks": []any{map[string]any{"path": "/x"}},
})
require.NoError(t, rewriteSerialConfigForRestore(path, "/inst/logs/app.log"))

var cfg map[string]any
data, err := os.ReadFile(path)
require.NoError(t, err)
require.NoError(t, json.Unmarshal(data, &cfg))
_, hasSerial := cfg["serial"]
assert.False(t, hasSerial, "should not synthesize a serial block")
})
}

func writeSnapshotConfig(t *testing.T, cfg map[string]any) string {
t.Helper()
dir := t.TempDir()
path := filepath.Join(dir, "config.json")
data, err := json.Marshal(cfg)
require.NoError(t, err)
require.NoError(t, os.WriteFile(path, data, 0644))
return path
}

func readSerialConfig(t *testing.T, path string) map[string]any {
t.Helper()
data, err := os.ReadFile(path)
require.NoError(t, err)
var cfg map[string]any
require.NoError(t, json.Unmarshal(data, &cfg))
return cfg["serial"].(map[string]any)
}
37 changes: 35 additions & 2 deletions lib/hypervisor/cloudhypervisor/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,26 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s
return 0, nil, fmt.Errorf("unsupported cloud-hypervisor version: %s", version)
}

// 0. Start the serial reader before CH so the unix socket is bound by
// the time CH boots and tries to connect.
sr, err := startSerialReader(ctx, serialSocketPath(config.SerialLogPath), config.SerialLogPath)
if err != nil {
return 0, nil, fmt.Errorf("start serial reader: %w", err)
}

// 1. Start the Cloud Hypervisor process
processCtx, processSpan := hypervisor.StartProcessSpan(ctx, hypervisor.TypeCloudHypervisor)
pid, err := vmm.StartProcess(processCtx, p, chVersion, socketPath)
hypervisor.FinishTraceSpan(processSpan, err)
if err != nil {
sr.Close()
return 0, nil, fmt.Errorf("start process: %w", err)
}

// Setup cleanup to kill the process if subsequent steps fail
cu := cleanup.Make(func() {
syscall.Kill(pid, syscall.SIGKILL)
sr.Close()
})
defer cu.Clean()

Expand Down Expand Up @@ -112,7 +121,10 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s
return 0, nil, fmt.Errorf("boot vm failed with status %d: %s", bootResp.StatusCode(), string(bootResp.Body))
}

// Success - release cleanup to prevent killing the process
// Success - release cleanup to prevent killing the process. Hand
// ownership of the serial reader to the client so Shutdown can
// stop it.
hv.serial = sr
cu.Release()
return pid, hv, nil
}
Expand All @@ -129,19 +141,37 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
return 0, nil, fmt.Errorf("unsupported cloud-hypervisor version: %s", version)
}

// 0. Start the serial reader before CH. The serial log path lives at
Comment thread
cursor[bot] marked this conversation as resolved.
// a fixed offset from the CH API socket within the instance directory.
logPath := filepath.Join(filepath.Dir(socketPath), "logs", "app.log")
sr, err := startSerialReader(ctx, serialSocketPath(logPath), logPath)
if err != nil {
return 0, nil, fmt.Errorf("start serial reader: %w", err)
}

// Migrate legacy serial.mode=File snapshots to Socket so CH binds the
// reader's socket on restore. New snapshots are already Socket; the
// rewrite is idempotent.
if err := rewriteSerialConfigForRestore(filepath.Join(snapshotPath, "config.json"), logPath); err != nil {
sr.Close()
return 0, nil, fmt.Errorf("rewrite snapshot serial config: %w", err)
}

// 1. Start the Cloud Hypervisor process
processStartTime := time.Now()
processCtx, processSpan := hypervisor.StartProcessSpan(ctx, hypervisor.TypeCloudHypervisor)
pid, err := vmm.StartProcess(processCtx, p, chVersion, socketPath)
hypervisor.FinishTraceSpan(processSpan, err)
if err != nil {
sr.Close()
return 0, nil, fmt.Errorf("start process: %w", err)
}
log.DebugContext(ctx, "CH process started", "pid", pid, "duration_ms", time.Since(processStartTime).Milliseconds())

// Setup cleanup to kill the process if subsequent steps fail
cu := cleanup.Make(func() {
syscall.Kill(pid, syscall.SIGKILL)
sr.Close()
})
defer cu.Clean()

Expand All @@ -167,7 +197,10 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
}
log.DebugContext(ctx, "CH restore API complete", "duration_ms", time.Since(restoreAPIStart).Milliseconds())

// Success - release cleanup to prevent killing the process
// Success - release cleanup to prevent killing the process. Hand
// ownership of the serial reader to the client so Shutdown can
// stop it.
hv.serial = sr
cu.Release()
log.DebugContext(ctx, "CH restore complete", "pid", pid, "total_duration_ms", time.Since(startTime).Milliseconds())
return pid, hv, nil
Expand Down
Loading
Loading