From d82b4e762ac8f8f758afe10b2f939038cd37703c Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Thu, 26 Mar 2026 15:08:02 -0400 Subject: [PATCH 1/5] Add automatic image retention cleanup --- cmd/api/config/config.go | 22 ++ cmd/api/config/config_test.go | 36 ++ cmd/api/image_retention_test.go | 54 +++ cmd/api/main.go | 42 +++ config.example.darwin.yaml | 9 + config.example.yaml | 9 + lib/imageretention/README.md | 24 ++ lib/imageretention/controller.go | 482 ++++++++++++++++++++++++++ lib/imageretention/controller_test.go | 290 ++++++++++++++++ lib/instances/create.go | 1 + lib/instances/image_usage_test.go | 96 +++++ lib/instances/manager.go | 27 ++ lib/paths/paths.go | 15 + 13 files changed, 1107 insertions(+) create mode 100644 cmd/api/image_retention_test.go create mode 100644 lib/imageretention/README.md create mode 100644 lib/imageretention/controller.go create mode 100644 lib/imageretention/controller_test.go create mode 100644 lib/instances/image_usage_test.go diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 4e0bdadf..64a19a41 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -123,6 +123,17 @@ type LoggingConfig struct { RotateInterval string `koanf:"rotate_interval"` } +// ImagesAutoDeleteConfig holds server-wide image retention settings. +type ImagesAutoDeleteConfig struct { + Enabled bool `koanf:"enabled"` + UnusedFor string `koanf:"unused_for"` +} + +// ImagesConfig holds image-management settings. +type ImagesConfig struct { + AutoDelete ImagesAutoDeleteConfig `koanf:"auto_delete"` +} + // BuildConfig holds source-to-image build system settings. type BuildConfig struct { MaxConcurrentSourceBuilds int `koanf:"max_concurrent_source_builds"` @@ -226,6 +237,7 @@ type Config struct { Metrics MetricsConfig `koanf:"metrics"` Otel OtelConfig `koanf:"otel"` Logging LoggingConfig `koanf:"logging"` + Images ImagesConfig `koanf:"images"` Build BuildConfig `koanf:"build"` Registry RegistryConfig `koanf:"registry"` Limits LimitsConfig `koanf:"limits"` @@ -319,6 +331,13 @@ func defaultConfig() *Config { RotateInterval: "5m", }, + Images: ImagesConfig{ + AutoDelete: ImagesAutoDeleteConfig{ + Enabled: false, + UnusedFor: "720h", + }, + }, + Build: BuildConfig{ MaxConcurrentSourceBuilds: 2, BuilderImage: "", // empty = build from embedded Dockerfile on first run @@ -511,6 +530,9 @@ func (c *Config) Validate() error { if c.Build.Timeout <= 0 { return fmt.Errorf("build.timeout must be positive, got %d", c.Build.Timeout) } + if err := validateDuration("images.auto_delete.unused_for", c.Images.AutoDelete.UnusedFor); err != nil { + return err + } algorithm := strings.ToLower(c.Snapshot.CompressionDefault.Algorithm) c.Snapshot.CompressionDefault.Algorithm = algorithm if c.Snapshot.CompressionDefault.Enabled { diff --git a/cmd/api/config/config_test.go b/cmd/api/config/config_test.go index 6ae361a1..3314e0bd 100644 --- a/cmd/api/config/config_test.go +++ b/cmd/api/config/config_test.go @@ -31,6 +31,12 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) { if cfg.Otel.SuccessfulGetSampleRatio != 0.1 { t.Fatalf("expected default otel.successful_get_sample_ratio to be 0.1, got %v", cfg.Otel.SuccessfulGetSampleRatio) } + if cfg.Images.AutoDelete.Enabled { + t.Fatalf("expected default images.auto_delete.enabled to be false") + } + if cfg.Images.AutoDelete.UnusedFor != "720h" { + t.Fatalf("expected default images.auto_delete.unused_for to be 720h, got %q", cfg.Images.AutoDelete.UnusedFor) + } } func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) { @@ -138,6 +144,36 @@ func TestValidateRejectsInvalidResourceRefreshInterval(t *testing.T) { } } +func TestLoadUsesDefaultImageAutoDeleteRetentionWindow(t *testing.T) { + tmp := t.TempDir() + cfgPath := filepath.Join(tmp, "config.yaml") + if err := os.WriteFile(cfgPath, []byte("images:\n auto_delete:\n enabled: true\n"), 0600); err != nil { + t.Fatalf("write temp config: %v", err) + } + + cfg, err := Load(cfgPath) + if err != nil { + t.Fatalf("load config: %v", err) + } + + if !cfg.Images.AutoDelete.Enabled { + t.Fatalf("expected images.auto_delete.enabled override to be true") + } + if cfg.Images.AutoDelete.UnusedFor != "720h" { + t.Fatalf("expected default images.auto_delete.unused_for to remain 720h, got %q", cfg.Images.AutoDelete.UnusedFor) + } +} + +func TestValidateRejectsInvalidImageAutoDeleteUnusedFor(t *testing.T) { + cfg := defaultConfig() + cfg.Images.AutoDelete.UnusedFor = "not-a-duration" + + err := cfg.Validate() + if err == nil { + t.Fatalf("expected validation error for invalid images.auto_delete.unused_for") + } +} + func TestValidateRejectsEmptyActiveBallooningDurations(t *testing.T) { cfg := defaultConfig() cfg.Hypervisor.Memory.ActiveBallooning.PollInterval = " " diff --git a/cmd/api/image_retention_test.go b/cmd/api/image_retention_test.go new file mode 100644 index 00000000..60299cea --- /dev/null +++ b/cmd/api/image_retention_test.go @@ -0,0 +1,54 @@ +package main + +import ( + "context" + "sync/atomic" + "testing" + "time" + + "golang.org/x/sync/errgroup" +) + +type stubImageRetentionRunner struct { + runCount atomic.Int32 +} + +func (s *stubImageRetentionRunner) Run(ctx context.Context) error { + s.runCount.Add(1) + <-ctx.Done() + return nil +} + +func TestStartImageRetentionControllerSkipsNilController(t *testing.T) { + grp, ctx := errgroup.WithContext(context.Background()) + + started := startImageRetentionController(grp, ctx, nil) + if started { + t.Fatalf("expected nil controller not to start") + } +} + +func TestStartImageRetentionControllerStartsRunner(t *testing.T) { + grp, ctx := errgroup.WithContext(context.Background()) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + runner := &stubImageRetentionRunner{} + started := startImageRetentionController(grp, ctx, runner) + if !started { + t.Fatalf("expected controller to start") + } + + deadline := time.Now().Add(time.Second) + for runner.runCount.Load() == 0 && time.Now().Before(deadline) { + time.Sleep(10 * time.Millisecond) + } + if runner.runCount.Load() != 1 { + t.Fatalf("expected runner to be started once, got %d", runner.runCount.Load()) + } + + cancel() + if err := grp.Wait(); err != nil { + t.Fatalf("wait for retention runner: %v", err) + } +} diff --git a/cmd/api/main.go b/cmd/api/main.go index 01b17f57..fffaf774 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -26,10 +26,13 @@ import ( "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/guest" "github.com/kernel/hypeman/lib/hypervisor/qemu" + "github.com/kernel/hypeman/lib/imageretention" + "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/instances" mw "github.com/kernel/hypeman/lib/middleware" "github.com/kernel/hypeman/lib/oapi" "github.com/kernel/hypeman/lib/otel" + "github.com/kernel/hypeman/lib/paths" "github.com/kernel/hypeman/lib/registry" "github.com/kernel/hypeman/lib/scopes" "github.com/kernel/hypeman/lib/vmm" @@ -59,6 +62,37 @@ func newMetricsServer(addr string, handler http.Handler) *http.Server { } } +type imageRetentionRunner interface { + Run(ctx context.Context) error +} + +func configureImageRetentionController(cfg *config.Config, imageManager images.Manager, instanceManager instances.Manager, logger *slog.Logger) (imageRetentionRunner, error) { + if cfg == nil || !cfg.Images.AutoDelete.Enabled { + return nil, nil + } + + unusedFor, err := time.ParseDuration(cfg.Images.AutoDelete.UnusedFor) + if err != nil { + return nil, fmt.Errorf("invalid images.auto_delete.unused_for %q: %w", cfg.Images.AutoDelete.UnusedFor, err) + } + + controller := imageretention.NewController(paths.New(cfg.DataDir), imageManager, unusedFor, logger) + if setter, ok := instanceManager.(instances.ImageUsageRecorderSetter); ok { + setter.SetImageUsageRecorder(controller) + } + return controller, nil +} + +func startImageRetentionController(grp *errgroup.Group, ctx context.Context, controller imageRetentionRunner) bool { + if grp == nil || controller == nil { + return false + } + grp.Go(func() error { + return controller.Run(ctx) + }) + return true +} + func run() error { // Load config early for OTel initialization // Config path can be specified via CONFIG_PATH env var or defaults to platform-specific locations @@ -429,6 +463,14 @@ func run() error { // Error group for coordinated shutdown grp, gctx := errgroup.WithContext(ctx) + retentionController, err := configureImageRetentionController(app.Config, app.ImageManager, app.InstanceManager, logger) + if err != nil { + return err + } + if startImageRetentionController(grp, gctx, retentionController) { + logger.Info("image auto-delete enabled", "unused_for", app.Config.Images.AutoDelete.UnusedFor) + } + // Start build manager background services (vsock handler for builder VMs) if err := app.BuildManager.Start(gctx); err != nil { logger.Error("failed to start build manager", "error", err) diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index 0ace51dc..267bf8a1 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -67,6 +67,15 @@ network: logging: level: debug +# ============================================================================= +# Images +# ============================================================================= +# images: +# auto_delete: +# enabled: false # server-wide automatic deletion of cached converted images +# unused_for: 720h # delete only after no instances or snapshots reference the image for this long +# # only affects data_dir/images, not the shared OCI cache + # ============================================================================= # Caddy / Ingress Configuration # ============================================================================= diff --git a/config.example.yaml b/config.example.yaml index 345c2da9..0652efe6 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -60,6 +60,15 @@ data_dir: /var/lib/hypeman # logging: # level: info # debug, info, warn, error +# ============================================================================= +# Images +# ============================================================================= +# images: +# auto_delete: +# enabled: false # server-wide automatic deletion of cached converted images +# unused_for: 720h # delete only after no instances or snapshots reference the image for this long +# # only affects data_dir/images, not the shared OCI cache + # ============================================================================= # Caddy / Ingress Configuration # ============================================================================= diff --git a/lib/imageretention/README.md b/lib/imageretention/README.md new file mode 100644 index 00000000..25ee7aa7 --- /dev/null +++ b/lib/imageretention/README.md @@ -0,0 +1,24 @@ +# Image Retention + +This feature automatically deletes cached converted images after they have been unused for a configurable amount of time. + +The retention window is server-wide and controlled by: + +```yaml +images: + auto_delete: + enabled: false + unused_for: 720h +``` + +When auto-delete is enabled: + +- The server runs a retention sweep on startup and then every minute. +- Only converted cached images under `data_dir/images` are eligible for deletion. +- Shared OCI cache data under `data_dir/system/oci-cache` is not modified. + +An image is considered in use if any persisted instance metadata or snapshot record still references it. As long as at least one such reference exists, the image is protected from deletion. + +The retention timer starts only after the last persisted reference disappears. At that point the server records `unused_since` for the image digest. Once `unused_since + unused_for` has elapsed, the cached image digest is deleted. + +New instance creation clears any stale unused state for the resolved image before the new instance metadata is written. This helps prevent races where an image is being reused right as retention cleanup runs. diff --git a/lib/imageretention/controller.go b/lib/imageretention/controller.go new file mode 100644 index 00000000..54411f22 --- /dev/null +++ b/lib/imageretention/controller.go @@ -0,0 +1,482 @@ +package imageretention + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io/fs" + "log/slog" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/instances" + "github.com/kernel/hypeman/lib/paths" + snapshotstore "github.com/kernel/hypeman/lib/snapshot" +) + +const sweepInterval = time.Minute + +type imageState struct { + Repository string `json:"repository"` + Digest string `json:"digest"` + UnusedSince time.Time `json:"unused_since"` +} + +type digestRef struct { + Repository string + Digest string + DigestHex string +} + +func (r digestRef) key() string { + return r.Repository + "@" + r.Digest +} + +func (r digestRef) digestRef() string { + return r.Repository + "@" + r.Digest +} + +// Controller tracks unused cached images and deletes them after a retention window. +type Controller struct { + paths *paths.Paths + imageManager images.Manager + unusedFor time.Duration + logger *slog.Logger + now func() time.Time + mu sync.Mutex +} + +// NewController creates a new image retention controller. +func NewController(p *paths.Paths, imageManager images.Manager, unusedFor time.Duration, logger *slog.Logger) *Controller { + if logger == nil { + logger = slog.Default() + } + return &Controller{ + paths: p, + imageManager: imageManager, + unusedFor: unusedFor, + logger: logger.With("component", "image_retention"), + now: time.Now, + } +} + +// Run executes one sweep immediately, then continues sweeping every minute until ctx is cancelled. +func (c *Controller) Run(ctx context.Context) error { + c.logger.InfoContext(ctx, "image auto-delete scheduler started", "unused_for", c.unusedFor, "interval", sweepInterval) + if err := c.Sweep(ctx); err != nil { + c.logger.ErrorContext(ctx, "image auto-delete sweep failed", "error", err) + } + + ticker := time.NewTicker(sweepInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + if err := c.Sweep(ctx); err != nil { + c.logger.ErrorContext(ctx, "image auto-delete sweep failed", "error", err) + } + } + } +} + +// Sweep performs one reconciliation pass of image retention state. +func (c *Controller) Sweep(ctx context.Context) error { + c.mu.Lock() + defer c.mu.Unlock() + + stats, err := c.sweep(ctx) + if err != nil { + return err + } + + c.logger.InfoContext(ctx, "image auto-delete sweep completed", + "ready_images", stats.readyImages, + "protected_images", stats.protectedImages, + "marked_unused", stats.markedUnused, + "cleared_states", stats.clearedStates, + "deleted_images", stats.deletedImages, + "pruned_states", stats.prunedStates, + "stale_references", stats.staleReferences, + ) + return nil +} + +// MarkUsed clears retention state for a digest that is about to be referenced by a new instance. +func (c *Controller) MarkUsed(ctx context.Context, imageName, digest string) error { + c.mu.Lock() + defer c.mu.Unlock() + + ref, err := c.resolveDigestRef(ctx, imageName, digest) + if err != nil { + return err + } + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return err + } + if removed { + c.logger.DebugContext(ctx, "cleared image auto-delete state", "image", imageName, "digest", ref.Digest) + } + return nil +} + +type sweepStats struct { + readyImages int + protectedImages int + markedUnused int + clearedStates int + deletedImages int + prunedStates int + staleReferences int +} + +func (c *Controller) sweep(ctx context.Context) (sweepStats, error) { + var stats sweepStats + + allImages, err := c.imageManager.ListImages(ctx) + if err != nil { + return stats, fmt.Errorf("list images: %w", err) + } + + ready := make(map[string]digestRef) + allLocal := make(map[string]struct{}) + nonReady := make(map[string]digestRef) + for _, img := range allImages { + ref, err := digestRefFromImage(img) + if err != nil { + c.logger.WarnContext(ctx, "skipping image with invalid retention metadata", "image", img.Name, "digest", img.Digest, "error", err) + continue + } + allLocal[ref.key()] = struct{}{} + if img.Status == images.StatusReady { + ready[ref.key()] = ref + stats.readyImages++ + continue + } + nonReady[ref.key()] = ref + } + + states, err := c.listStates() + if err != nil { + return stats, err + } + + protected, staleRefs, err := c.collectProtectedDigests(ctx) + if err != nil { + return stats, err + } + stats.protectedImages = len(protected) + stats.staleReferences = staleRefs + + for _, ref := range protected { + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return stats, err + } + if removed { + stats.clearedStates++ + } + } + + for _, ref := range nonReady { + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return stats, err + } + if removed { + stats.clearedStates++ + } + } + + for key, state := range states { + if _, ok := allLocal[key]; ok { + continue + } + removed, err := c.deleteStateIfExists(digestRef{ + Repository: state.Repository, + Digest: state.Digest, + DigestHex: strings.TrimPrefix(state.Digest, "sha256:"), + }) + if err != nil { + return stats, err + } + if removed { + stats.prunedStates++ + } + } + + now := c.now().UTC() + for key, ref := range ready { + if _, ok := protected[key]; ok { + continue + } + + state, ok := states[key] + if !ok { + if err := c.writeState(imageState{ + Repository: ref.Repository, + Digest: ref.Digest, + UnusedSince: now, + }); err != nil { + return stats, err + } + stats.markedUnused++ + continue + } + + if state.UnusedSince.IsZero() { + state.UnusedSince = now + if err := c.writeState(state); err != nil { + return stats, err + } + stats.markedUnused++ + continue + } + + if state.UnusedSince.UTC().Add(c.unusedFor).After(now) { + continue + } + + if err := c.imageManager.DeleteImage(ctx, ref.digestRef()); err != nil { + if errors.Is(err, images.ErrNotFound) { + removed, removeErr := c.deleteStateIfExists(ref) + if removeErr != nil { + return stats, removeErr + } + if removed { + stats.prunedStates++ + } + continue + } + return stats, fmt.Errorf("delete image %s: %w", ref.digestRef(), err) + } + + c.logger.InfoContext(ctx, "deleted unused cached image", "image", ref.digestRef(), "unused_since", state.UnusedSince, "unused_for", c.unusedFor) + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return stats, err + } + if removed { + stats.deletedImages++ + } + } + + return stats, nil +} + +func (c *Controller) collectProtectedDigests(ctx context.Context) (map[string]digestRef, int, error) { + imageRefs, err := c.collectReferencedImageNames(ctx) + if err != nil { + return nil, 0, err + } + + protected := make(map[string]digestRef, len(imageRefs)) + staleRefs := 0 + for imageName := range imageRefs { + img, err := c.imageManager.GetImage(ctx, imageName) + if err != nil { + staleRefs++ + c.logger.WarnContext(ctx, "skipping stale image reference during retention sweep", "image", imageName, "error", err) + continue + } + ref, err := digestRefFromImage(*img) + if err != nil { + staleRefs++ + c.logger.WarnContext(ctx, "skipping unresolvable retained image", "image", imageName, "digest", img.Digest, "error", err) + continue + } + protected[ref.key()] = ref + } + return protected, staleRefs, nil +} + +func (c *Controller) collectReferencedImageNames(ctx context.Context) (map[string]struct{}, error) { + _ = ctx + + refs := make(map[string]struct{}) + guestDir := c.paths.GuestsDir() + if entries, err := os.ReadDir(guestDir); err == nil { + for _, entry := range entries { + if !entry.IsDir() { + continue + } + metaPath := c.paths.InstanceMetadata(entry.Name()) + content, err := os.ReadFile(metaPath) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + continue + } + return nil, fmt.Errorf("read instance metadata %s: %w", metaPath, err) + } + + var stored instances.StoredMetadata + if err := json.Unmarshal(content, &stored); err != nil { + c.logger.Warn("skipping invalid instance metadata during retention sweep", "path", metaPath, "error", err) + continue + } + if stored.Image != "" { + refs[stored.Image] = struct{}{} + } + } + } else if !errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("read guests directory: %w", err) + } + + store := snapshotstore.NewStore(c.paths) + records, err := store.ListRecords() + if err != nil { + return nil, fmt.Errorf("list snapshot records: %w", err) + } + for _, record := range records { + if len(record.StoredMetadata) == 0 { + continue + } + var stored instances.StoredMetadata + if err := json.Unmarshal(record.StoredMetadata, &stored); err != nil { + c.logger.Warn("skipping invalid snapshot stored metadata during retention sweep", "snapshot_id", record.Snapshot.Id, "error", err) + continue + } + if stored.Image != "" { + refs[stored.Image] = struct{}{} + } + } + + return refs, nil +} + +func (c *Controller) resolveDigestRef(ctx context.Context, imageName, digest string) (digestRef, error) { + if strings.TrimSpace(digest) == "" { + img, err := c.imageManager.GetImage(ctx, imageName) + if err != nil { + return digestRef{}, err + } + return digestRefFromImage(*img) + } + + ref, err := images.ParseNormalizedRef(imageName) + if err != nil { + return digestRef{}, fmt.Errorf("parse image name: %w", err) + } + return digestRef{ + Repository: ref.Repository(), + Digest: normalizeDigest(digest), + DigestHex: strings.TrimPrefix(normalizeDigest(digest), "sha256:"), + }, nil +} + +func digestRefFromImage(img images.Image) (digestRef, error) { + if strings.TrimSpace(img.Digest) == "" { + return digestRef{}, fmt.Errorf("missing digest") + } + ref, err := images.ParseNormalizedRef(img.Name) + if err != nil { + return digestRef{}, err + } + digest := normalizeDigest(img.Digest) + return digestRef{ + Repository: ref.Repository(), + Digest: digest, + DigestHex: strings.TrimPrefix(digest, "sha256:"), + }, nil +} + +func normalizeDigest(digest string) string { + digest = strings.TrimSpace(digest) + if strings.HasPrefix(digest, "sha256:") { + return digest + } + return "sha256:" + digest +} + +func (c *Controller) listStates() (map[string]imageState, error) { + root := c.paths.SystemImageRetentionDir() + states := make(map[string]imageState) + + err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error { + if walkErr != nil { + if errors.Is(walkErr, os.ErrNotExist) { + return nil + } + return walkErr + } + if d.IsDir() || filepath.Ext(path) != ".json" { + return nil + } + + content, err := os.ReadFile(path) + if err != nil { + return fmt.Errorf("read image retention state %s: %w", path, err) + } + + var state imageState + if err := json.Unmarshal(content, &state); err != nil { + return fmt.Errorf("unmarshal image retention state %s: %w", path, err) + } + + key := state.Repository + "@" + state.Digest + states[key] = state + return nil + }) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("list image retention state: %w", err) + } + + return states, nil +} + +func (c *Controller) writeState(state imageState) error { + path := c.paths.ImageRetentionState(state.Repository, strings.TrimPrefix(state.Digest, "sha256:")) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return fmt.Errorf("create image retention directory: %w", err) + } + + content, err := json.MarshalIndent(state, "", " ") + if err != nil { + return fmt.Errorf("marshal image retention state: %w", err) + } + + tmpPath := path + ".tmp" + if err := os.WriteFile(tmpPath, content, 0o644); err != nil { + return fmt.Errorf("write image retention temp state: %w", err) + } + if err := os.Rename(tmpPath, path); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("rename image retention state: %w", err) + } + return nil +} + +func (c *Controller) deleteStateIfExists(ref digestRef) (bool, error) { + path := c.paths.ImageRetentionState(ref.Repository, ref.DigestHex) + if err := os.Remove(path); err != nil { + if errors.Is(err, os.ErrNotExist) { + return false, nil + } + return false, fmt.Errorf("remove image retention state %s: %w", path, err) + } + c.removeEmptyParents(filepath.Dir(path), c.paths.SystemImageRetentionDir()) + return true, nil +} + +func (c *Controller) removeEmptyParents(path string, root string) { + root = filepath.Clean(root) + for { + cleaned := filepath.Clean(path) + if cleaned == root || cleaned == "." || cleaned == string(filepath.Separator) { + return + } + err := os.Remove(cleaned) + if err != nil { + return + } + path = filepath.Dir(cleaned) + } +} diff --git a/lib/imageretention/controller_test.go b/lib/imageretention/controller_test.go new file mode 100644 index 00000000..129dd3d8 --- /dev/null +++ b/lib/imageretention/controller_test.go @@ -0,0 +1,290 @@ +package imageretention + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "testing" + "time" + + "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/instances" + "github.com/kernel/hypeman/lib/paths" + snapshotstore "github.com/kernel/hypeman/lib/snapshot" + "github.com/stretchr/testify/require" +) + +func TestSweepMarksUnreferencedImageUnused(t *testing.T) { + controller, p, _ := newTestController(t, 30*24*time.Hour) + const digest = "sha256:1111111111111111111111111111111111111111111111111111111111111111" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + + now := time.Date(2026, 3, 26, 12, 0, 0, 0, time.UTC) + controller.now = func() time.Time { return now } + + require.NoError(t, controller.Sweep(context.Background())) + + state := readState(t, p, "docker.io/library/alpine", digest) + require.Equal(t, now, state.UnusedSince) +} + +func TestSweepReferencedInstancePreventsUnusedState(t *testing.T) { + controller, p, _ := newTestController(t, 30*24*time.Hour) + const digest = "sha256:2222222222222222222222222222222222222222222222222222222222222222" + imageName := "docker.io/library/alpine:latest" + seedReadyImage(t, p, imageName, digest) + writeInstanceMetadata(t, p, "inst-1", instances.StoredMetadata{ + Id: "inst-1", + Name: "instance-1", + Image: imageName, + }) + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.Error(t, err) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepSnapshotReferencePreventsDeletion(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour) + const digest = "sha256:3333333333333333333333333333333333333333333333333333333333333333" + imageName := "docker.io/library/alpine:latest" + seedReadyImage(t, p, imageName, digest) + writeSnapshotRecord(t, p, "snap-1", instances.StoredMetadata{Image: imageName}) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest))) + require.NoError(t, err) + _, err = os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepDeletesExpiredImageDigestAndAllTags(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour) + const digest = "sha256:4444444444444444444444444444444444444444444444444444444444444444" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest, "stable") + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) + _, err = os.Stat(p.ImageTagSymlink("docker.io/library/alpine", "latest")) + require.True(t, os.IsNotExist(err)) + _, err = os.Stat(p.ImageTagSymlink("docker.io/library/alpine", "stable")) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepProtectedImageClearsPriorState(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour) + const digest = "sha256:5555555555555555555555555555555555555555555555555555555555555555" + imageName := "docker.io/library/alpine:latest" + seedReadyImage(t, p, imageName, digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + writeInstanceMetadata(t, p, "inst-1", instances.StoredMetadata{ + Id: "inst-1", + Name: "instance-1", + Image: imageName, + }) + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepPrunesStateForManuallyDeletedImage(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour) + const digest = "sha256:6666666666666666666666666666666666666666666666666666666666666666" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + require.NoError(t, os.RemoveAll(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest)))) + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepIgnoresNonReadyImages(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour) + const digest = "sha256:7777777777777777777777777777777777777777777777777777777777777777" + seedImage(t, p, "docker.io/library/alpine:latest", digest, images.StatusPending) + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepStaleReferencesDoNotBlockOtherCleanup(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour) + const staleDigest = "sha256:8888888888888888888888888888888888888888888888888888888888888888" + const liveDigest = "sha256:9999999999999999999999999999999999999999999999999999999999999999" + seedReadyImage(t, p, "docker.io/library/alpine:latest", liveDigest) + writeStateAt(t, p, "docker.io/library/alpine", liveDigest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + writeInstanceMetadata(t, p, "inst-1", instances.StoredMetadata{ + Id: "inst-1", + Name: "instance-1", + Image: "docker.io/library/busybox@" + staleDigest, + }) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(liveDigest))) + require.True(t, os.IsNotExist(err)) +} + +func TestMarkUsedClearsState(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour) + const digest = "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + require.NoError(t, controller.MarkUsed(context.Background(), "docker.io/library/alpine:latest", digest)) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestRunPerformsImmediateSweep(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour) + const digest = "sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + now := time.Date(2026, 3, 26, 12, 0, 0, 0, time.UTC) + controller.now = func() time.Time { return now } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + defer close(done) + _ = controller.Run(ctx) + }() + + statePath := p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest)) + require.Eventually(t, func() bool { + _, err := os.Stat(statePath) + return err == nil + }, time.Second, 10*time.Millisecond) + + cancel() + <-done + + state := readState(t, p, "docker.io/library/alpine", digest) + require.Equal(t, now, state.UnusedSince) +} + +func newTestController(t *testing.T, unusedFor time.Duration) (*Controller, *paths.Paths, images.Manager) { + t.Helper() + + dataDir := t.TempDir() + p := paths.New(dataDir) + manager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + return NewController(p, manager, unusedFor, nil), p, manager +} + +func seedReadyImage(t *testing.T, p *paths.Paths, imageName, digest string, extraTags ...string) { + t.Helper() + seedImage(t, p, imageName, digest, images.StatusReady, extraTags...) +} + +func seedImage(t *testing.T, p *paths.Paths, imageName, digest, status string, extraTags ...string) { + t.Helper() + + ref, err := images.ParseNormalizedRef(imageName) + require.NoError(t, err) + + meta := map[string]any{ + "name": ref.String(), + "digest": digest, + "status": status, + "created_at": time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC), + } + + digestHex := stringsTrimDigest(digest) + dir := p.ImageDigestDir(ref.Repository(), digestHex) + require.NoError(t, os.MkdirAll(dir, 0o755)) + if status == images.StatusReady { + require.NoError(t, os.WriteFile(p.ImageDigestPath(ref.Repository(), digestHex), []byte("rootfs"), 0o644)) + } + content, err := json.MarshalIndent(meta, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(p.ImageMetadata(ref.Repository(), digestHex), content, 0o644)) + + if !ref.IsDigest() { + require.NoError(t, os.MkdirAll(p.ImageRepositoryDir(ref.Repository()), 0o755)) + require.NoError(t, os.Symlink(digestHex, p.ImageTagSymlink(ref.Repository(), ref.Tag()))) + } + for _, tag := range extraTags { + require.NoError(t, os.Symlink(digestHex, p.ImageTagSymlink(ref.Repository(), tag))) + } +} + +func writeInstanceMetadata(t *testing.T, p *paths.Paths, id string, stored instances.StoredMetadata) { + t.Helper() + require.NoError(t, os.MkdirAll(p.InstanceDir(id), 0o755)) + content, err := json.MarshalIndent(stored, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(p.InstanceMetadata(id), content, 0o644)) +} + +func writeSnapshotRecord(t *testing.T, p *paths.Paths, snapshotID string, stored instances.StoredMetadata) { + t.Helper() + content, err := json.Marshal(stored) + require.NoError(t, err) + + store := snapshotstore.NewStore(p) + err = store.SaveRecord(&snapshotstore.Record{ + Snapshot: snapshotstore.Snapshot{ + Id: snapshotID, + SourceInstanceID: "inst-1", + CreatedAt: time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC), + }, + StoredMetadata: content, + }) + require.NoError(t, err) +} + +func writeStateAt(t *testing.T, p *paths.Paths, repository, digest string, unusedSince time.Time) { + t.Helper() + statePath := p.ImageRetentionState(repository, stringsTrimDigest(digest)) + require.NoError(t, os.MkdirAll(filepath.Dir(statePath), 0o755)) + content, err := json.MarshalIndent(imageState{ + Repository: repository, + Digest: digest, + UnusedSince: unusedSince, + }, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(statePath, content, 0o644)) +} + +func readState(t *testing.T, p *paths.Paths, repository, digest string) imageState { + t.Helper() + content, err := os.ReadFile(p.ImageRetentionState(repository, stringsTrimDigest(digest))) + require.NoError(t, err) + var state imageState + require.NoError(t, json.Unmarshal(content, &state)) + return state +} + +func stringsTrimDigest(digest string) string { + return digest[len("sha256:"):] +} diff --git a/lib/instances/create.go b/lib/instances/create.go index a620a24f..ca4be904 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -133,6 +133,7 @@ func (m *manager) createInstance( log.ErrorContext(ctx, "image not ready", "image", req.Image, "status", imageInfo.Status) return nil, fmt.Errorf("%w: image status is %s", ErrImageNotReady, imageInfo.Status) } + m.recordImageUsage(ctx, imageInfo) // 3. Generate instance ID (CUID2 for secure, collision-resistant IDs) id := cuid2.Generate() diff --git a/lib/instances/image_usage_test.go b/lib/instances/image_usage_test.go new file mode 100644 index 00000000..7ec1d946 --- /dev/null +++ b/lib/instances/image_usage_test.go @@ -0,0 +1,96 @@ +package instances + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "testing" + "time" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/paths" + "github.com/stretchr/testify/require" +) + +func TestCreateInstanceClearsRetentionStateBeforeMetadataSave(t *testing.T) { + mgr, tmpDir := setupTestManager(t) + p := paths.New(tmpDir) + + const digest = "sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" + const imageName = "docker.io/library/alpine:latest" + seedLocalReadyImage(t, p, imageName, digest) + writeImageRetentionState(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + recorder := &stubImageUsageRecorder{ + statePath: p.ImageRetentionState("docker.io/library/alpine", trimDigestPrefix(digest)), + } + mgr.SetImageUsageRecorder(recorder) + + _, err := mgr.CreateInstance(context.Background(), CreateInstanceRequest{ + Name: "retention-test", + Image: imageName, + Hypervisor: hypervisor.Type("missing-hypervisor"), + }) + require.Error(t, err) + + _, err = os.Stat(p.ImageRetentionState("docker.io/library/alpine", trimDigestPrefix(digest))) + require.True(t, os.IsNotExist(err)) + + metaFiles, err := mgr.listMetadataFiles() + require.NoError(t, err) + require.Len(t, metaFiles, 0) + require.Equal(t, 1, recorder.calls) +} + +func seedLocalReadyImage(t *testing.T, p *paths.Paths, imageName, digest string) { + t.Helper() + + ref, err := images.ParseNormalizedRef(imageName) + require.NoError(t, err) + + digestHex := trimDigestPrefix(digest) + require.NoError(t, os.MkdirAll(p.ImageDigestDir(ref.Repository(), digestHex), 0o755)) + require.NoError(t, os.WriteFile(p.ImageDigestPath(ref.Repository(), digestHex), []byte("rootfs"), 0o644)) + + meta := map[string]any{ + "name": ref.String(), + "digest": digest, + "status": images.StatusReady, + "created_at": time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC), + } + content, err := json.MarshalIndent(meta, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(p.ImageMetadata(ref.Repository(), digestHex), content, 0o644)) + require.NoError(t, os.MkdirAll(p.ImageRepositoryDir(ref.Repository()), 0o755)) + require.NoError(t, os.Symlink(digestHex, p.ImageTagSymlink(ref.Repository(), ref.Tag()))) +} + +func writeImageRetentionState(t *testing.T, p *paths.Paths, repository, digest string, unusedSince time.Time) { + t.Helper() + statePath := p.ImageRetentionState(repository, trimDigestPrefix(digest)) + require.NoError(t, os.MkdirAll(filepath.Dir(statePath), 0o755)) + + content, err := json.MarshalIndent(map[string]any{ + "repository": repository, + "digest": digest, + "unused_since": unusedSince, + }, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(statePath, content, 0o644)) +} + +func trimDigestPrefix(digest string) string { + return digest[len("sha256:"):] +} + +type stubImageUsageRecorder struct { + statePath string + calls int +} + +func (s *stubImageUsageRecorder) MarkUsed(ctx context.Context, imageName, digest string) error { + s.calls++ + return os.Remove(s.statePath) +} diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 389486b3..f4e90cdf 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -12,6 +12,7 @@ import ( "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/network" "github.com/kernel/hypeman/lib/paths" "github.com/kernel/hypeman/lib/resources" @@ -58,6 +59,16 @@ type Manager interface { GetVsockDialer(ctx context.Context, instanceID string) (hypervisor.VsockDialer, error) } +// ImageUsageRecorder records newly used images before instance metadata is persisted. +type ImageUsageRecorder interface { + MarkUsed(ctx context.Context, imageName, digest string) error +} + +// ImageUsageRecorderSetter configures an optional image usage recorder on the manager. +type ImageUsageRecorderSetter interface { + SetImageUsageRecorder(recorder ImageUsageRecorder) +} + // ResourceLimits contains configurable resource limits for instances type ResourceLimits struct { MaxOverlaySize int64 // Maximum overlay disk size in bytes per instance @@ -99,6 +110,7 @@ type manager struct { compressionJobs map[string]*compressionJob nativeCodecMu sync.Mutex nativeCodecPaths map[string]string + imageUsageRecorder ImageUsageRecorder // Hypervisor support vmStarters map[hypervisor.Type]hypervisor.VMStarter @@ -171,6 +183,11 @@ func (m *manager) SetResourceValidator(v ResourceValidator) { m.resourceValidator = v } +// SetImageUsageRecorder configures an optional recorder for pre-persistence image usage. +func (m *manager) SetImageUsageRecorder(recorder ImageUsageRecorder) { + m.imageUsageRecorder = recorder +} + // getHypervisor creates a hypervisor client for the given socket and type. // Used for connecting to already-running VMs (e.g., for state queries). func (m *manager) getHypervisor(socketPath string, hvType hypervisor.Type) (hypervisor.Hypervisor, error) { @@ -217,6 +234,16 @@ func (m *manager) maybePersistBootMarkers(ctx context.Context, id string) { m.persistBootMarkers(ctx, id) } +func (m *manager) recordImageUsage(ctx context.Context, imageInfo *images.Image) { + if m.imageUsageRecorder == nil || imageInfo == nil { + return + } + if err := m.imageUsageRecorder.MarkUsed(ctx, imageInfo.Name, imageInfo.Digest); err != nil { + logger := logger.FromContext(ctx) + logger.WarnContext(ctx, "failed to record image usage", "image", imageInfo.Name, "digest", imageInfo.Digest, "error", err) + } +} + // CreateInstance creates and starts a new instance func (m *manager) CreateInstance(ctx context.Context, req CreateInstanceRequest) (*Instance, error) { // Note: ID is generated inside createInstance, so we can't lock before calling it. diff --git a/lib/paths/paths.go b/lib/paths/paths.go index 35ddf884..adc070c4 100644 --- a/lib/paths/paths.go +++ b/lib/paths/paths.go @@ -53,6 +53,21 @@ func (p *Paths) SystemOCICache() string { return filepath.Join(p.dataDir, "system", "oci-cache") } +// SystemImageRetentionDir returns the path to persisted image retention state. +func (p *Paths) SystemImageRetentionDir() string { + return filepath.Join(p.dataDir, "system", "image-retention") +} + +// ImageRetentionRepositoryDir returns the directory for image retention state for one repository. +func (p *Paths) ImageRetentionRepositoryDir(repository string) string { + return filepath.Join(p.SystemImageRetentionDir(), repository) +} + +// ImageRetentionState returns the state file path for one retained image digest. +func (p *Paths) ImageRetentionState(repository, digestHex string) string { + return filepath.Join(p.ImageRetentionRepositoryDir(repository), digestHex+".json") +} + // OCICacheBlobDir returns the path to the OCI cache blobs directory. func (p *Paths) OCICacheBlobDir() string { return filepath.Join(p.SystemOCICache(), "blobs", "sha256") From 436197947c01d13b014d6afa3818efa3f62f3bfa Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Thu, 26 Mar 2026 15:35:15 -0400 Subject: [PATCH 2/5] Add image retention observability metrics --- cmd/api/main.go | 17 ++- lib/imageretention/controller.go | 33 +++++- lib/imageretention/controller_test.go | 155 +++++++++++++++++++++++++- lib/imageretention/metrics.go | 121 ++++++++++++++++++++ lib/otel/README.md | 5 + 5 files changed, 324 insertions(+), 7 deletions(-) create mode 100644 lib/imageretention/metrics.go diff --git a/cmd/api/main.go b/cmd/api/main.go index fffaf774..11657c58 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -29,6 +29,7 @@ import ( "github.com/kernel/hypeman/lib/imageretention" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/instances" + loglib "github.com/kernel/hypeman/lib/logger" mw "github.com/kernel/hypeman/lib/middleware" "github.com/kernel/hypeman/lib/oapi" "github.com/kernel/hypeman/lib/otel" @@ -38,6 +39,7 @@ import ( "github.com/kernel/hypeman/lib/vmm" nethttpmiddleware "github.com/oapi-codegen/nethttp-middleware" "github.com/riandyrn/otelchi" + "go.opentelemetry.io/otel/metric" "golang.org/x/sync/errgroup" ) @@ -66,7 +68,7 @@ type imageRetentionRunner interface { Run(ctx context.Context) error } -func configureImageRetentionController(cfg *config.Config, imageManager images.Manager, instanceManager instances.Manager, logger *slog.Logger) (imageRetentionRunner, error) { +func configureImageRetentionController(cfg *config.Config, imageManager images.Manager, instanceManager instances.Manager, logger *slog.Logger, meter metric.Meter) (imageRetentionRunner, error) { if cfg == nil || !cfg.Images.AutoDelete.Enabled { return nil, nil } @@ -76,7 +78,10 @@ func configureImageRetentionController(cfg *config.Config, imageManager images.M return nil, fmt.Errorf("invalid images.auto_delete.unused_for %q: %w", cfg.Images.AutoDelete.UnusedFor, err) } - controller := imageretention.NewController(paths.New(cfg.DataDir), imageManager, unusedFor, logger) + controller, err := imageretention.NewController(paths.New(cfg.DataDir), imageManager, unusedFor, logger, meter) + if err != nil { + return nil, err + } if setter, ok := instanceManager.(instances.ImageUsageRecorderSetter); ok { setter.SetImageUsageRecorder(controller) } @@ -463,7 +468,13 @@ func run() error { // Error group for coordinated shutdown grp, gctx := errgroup.WithContext(ctx) - retentionController, err := configureImageRetentionController(app.Config, app.ImageManager, app.InstanceManager, logger) + retentionController, err := configureImageRetentionController( + app.Config, + app.ImageManager, + app.InstanceManager, + logger, + otelProvider.MeterFor(loglib.SubsystemImages), + ) if err != nil { return err } diff --git a/lib/imageretention/controller.go b/lib/imageretention/controller.go index 54411f22..be62b591 100644 --- a/lib/imageretention/controller.go +++ b/lib/imageretention/controller.go @@ -17,6 +17,7 @@ import ( "github.com/kernel/hypeman/lib/instances" "github.com/kernel/hypeman/lib/paths" snapshotstore "github.com/kernel/hypeman/lib/snapshot" + "go.opentelemetry.io/otel/metric" ) const sweepInterval = time.Minute @@ -47,22 +48,31 @@ type Controller struct { imageManager images.Manager unusedFor time.Duration logger *slog.Logger + metrics *Metrics now func() time.Time mu sync.Mutex } // NewController creates a new image retention controller. -func NewController(p *paths.Paths, imageManager images.Manager, unusedFor time.Duration, logger *slog.Logger) *Controller { +func NewController(p *paths.Paths, imageManager images.Manager, unusedFor time.Duration, logger *slog.Logger, meter metric.Meter) (*Controller, error) { if logger == nil { logger = slog.Default() } - return &Controller{ + controller := &Controller{ paths: p, imageManager: imageManager, unusedFor: unusedFor, logger: logger.With("component", "image_retention"), now: time.Now, } + if meter != nil { + metrics, err := newMetrics(meter, controller) + if err != nil { + return nil, fmt.Errorf("create image retention metrics: %w", err) + } + controller.metrics = metrics + } + return controller, nil } // Run executes one sweep immediately, then continues sweeping every minute until ctx is cancelled. @@ -92,12 +102,20 @@ func (c *Controller) Sweep(ctx context.Context) error { c.mu.Lock() defer c.mu.Unlock() + start := time.Now() stats, err := c.sweep(ctx) + status := "success" + if err != nil { + status = "error" + } + if c.metrics != nil { + c.metrics.RecordSweep(ctx, status, time.Since(start), stats.staleReferences) + } if err != nil { return err } - c.logger.InfoContext(ctx, "image auto-delete sweep completed", + c.logger.DebugContext(ctx, "image auto-delete sweep completed", "ready_images", stats.readyImages, "protected_images", stats.protectedImages, "marked_unused", stats.markedUnused, @@ -247,6 +265,9 @@ func (c *Controller) sweep(ctx context.Context) (sweepStats, error) { if err := c.imageManager.DeleteImage(ctx, ref.digestRef()); err != nil { if errors.Is(err, images.ErrNotFound) { + if c.metrics != nil { + c.metrics.RecordDelete(ctx, "not_found") + } removed, removeErr := c.deleteStateIfExists(ref) if removeErr != nil { return stats, removeErr @@ -256,8 +277,14 @@ func (c *Controller) sweep(ctx context.Context) (sweepStats, error) { } continue } + if c.metrics != nil { + c.metrics.RecordDelete(ctx, "error") + } return stats, fmt.Errorf("delete image %s: %w", ref.digestRef(), err) } + if c.metrics != nil { + c.metrics.RecordDelete(ctx, "success") + } c.logger.InfoContext(ctx, "deleted unused cached image", "image", ref.digestRef(), "unused_since", state.UnusedSince, "unused_for", c.unusedFor) removed, err := c.deleteStateIfExists(ref) diff --git a/lib/imageretention/controller_test.go b/lib/imageretention/controller_test.go index 129dd3d8..a8eb0dc8 100644 --- a/lib/imageretention/controller_test.go +++ b/lib/imageretention/controller_test.go @@ -1,8 +1,10 @@ package imageretention import ( + "bytes" "context" "encoding/json" + "log/slog" "os" "path/filepath" "testing" @@ -13,6 +15,9 @@ import ( "github.com/kernel/hypeman/lib/paths" snapshotstore "github.com/kernel/hypeman/lib/snapshot" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/attribute" + otelmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" ) func TestSweepMarksUnreferencedImageUnused(t *testing.T) { @@ -29,6 +34,36 @@ func TestSweepMarksUnreferencedImageUnused(t *testing.T) { require.Equal(t, now, state.UnusedSince) } +func TestSweepRecordsMetrics(t *testing.T) { + controller, p, reader := newMetricTestController(t, 24*time.Hour) + const trackedDigest = "sha256:0101010101010101010101010101010101010101010101010101010101010101" + const deletedDigest = "sha256:0202020202020202020202020202020202020202020202020202020202020202" + + seedReadyImage(t, p, "docker.io/library/alpine:latest", trackedDigest) + seedReadyImage(t, p, "docker.io/library/busybox:latest", deletedDigest) + writeStateAt(t, p, "docker.io/library/busybox", deletedDigest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + writeInstanceMetadata(t, p, "inst-1", instances.StoredMetadata{ + Id: "inst-1", + Name: "instance-1", + Image: "docker.io/library/debian@sha256:0303030303030303030303030303030303030303030303030303030303030303", + }) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + rm := collectRetentionMetrics(t, reader) + + require.Equal(t, int64(1), int64SumValue(t, rm, "hypeman_image_retention_sweeps_total", map[string]string{"status": "success"})) + require.Equal(t, uint64(1), float64HistogramCount(t, rm, "hypeman_image_retention_sweep_duration_seconds", map[string]string{"status": "success"})) + require.Equal(t, int64(1), int64SumValue(t, rm, "hypeman_image_retention_deletes_total", map[string]string{"status": "success"})) + require.Equal(t, int64(1), int64SumValue(t, rm, "hypeman_image_retention_stale_references_total", nil)) + require.Equal(t, int64(1), int64GaugeValue(t, rm, "hypeman_image_retention_pending_images", map[string]string{"state": "tracked"})) + require.Equal(t, int64(0), int64GaugeValue(t, rm, "hypeman_image_retention_pending_images", map[string]string{"state": "expired"})) +} + func TestSweepReferencedInstancePreventsUnusedState(t *testing.T) { controller, p, _ := newTestController(t, 30*24*time.Hour) const digest = "sha256:2222222222222222222222222222222222222222222222222222222222222222" @@ -190,6 +225,22 @@ func TestRunPerformsImmediateSweep(t *testing.T) { require.Equal(t, now, state.UnusedSince) } +func TestSweepSuccessLogIsDebugOnly(t *testing.T) { + dataDir := t.TempDir() + p := paths.New(dataDir) + manager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + var out bytes.Buffer + logger := slog.New(slog.NewTextHandler(&out, &slog.HandlerOptions{Level: slog.LevelInfo})) + controller, err := NewController(p, manager, 24*time.Hour, logger, nil) + require.NoError(t, err) + seedReadyImage(t, p, "docker.io/library/alpine:latest", "sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc") + + require.NoError(t, controller.Sweep(context.Background())) + require.NotContains(t, out.String(), "image auto-delete sweep completed") +} + func newTestController(t *testing.T, unusedFor time.Duration) (*Controller, *paths.Paths, images.Manager) { t.Helper() @@ -198,7 +249,24 @@ func newTestController(t *testing.T, unusedFor time.Duration) (*Controller, *pat manager, err := images.NewManager(p, 1, nil) require.NoError(t, err) - return NewController(p, manager, unusedFor, nil), p, manager + controller, err := NewController(p, manager, unusedFor, nil, nil) + require.NoError(t, err) + return controller, p, manager +} + +func newMetricTestController(t *testing.T, unusedFor time.Duration) (*Controller, *paths.Paths, *otelmetric.ManualReader) { + t.Helper() + + dataDir := t.TempDir() + p := paths.New(dataDir) + manager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + reader := otelmetric.NewManualReader() + provider := otelmetric.NewMeterProvider(otelmetric.WithReader(reader)) + controller, err := NewController(p, manager, unusedFor, nil, provider.Meter("test")) + require.NoError(t, err) + return controller, p, reader } func seedReadyImage(t *testing.T, p *paths.Paths, imageName, digest string, extraTags ...string) { @@ -285,6 +353,91 @@ func readState(t *testing.T, p *paths.Paths, repository, digest string) imageSta return state } +func collectRetentionMetrics(t *testing.T, reader *otelmetric.ManualReader) metricdata.ResourceMetrics { + t.Helper() + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(context.Background(), &rm)) + return rm +} + +func int64GaugeValue(t *testing.T, rm metricdata.ResourceMetrics, name string, wantAttrs map[string]string) int64 { + t.Helper() + + metric := findRetentionMetric(t, rm, name) + gauge, ok := metric.Data.(metricdata.Gauge[int64]) + require.True(t, ok, "expected int64 gauge metric data for %s", name) + for _, point := range gauge.DataPoints { + if retentionMetricAttrsMatch(point.Attributes, wantAttrs) { + return point.Value + } + } + t.Fatalf("metric %s with attrs %v not found", name, wantAttrs) + return 0 +} + +func int64SumValue(t *testing.T, rm metricdata.ResourceMetrics, name string, wantAttrs map[string]string) int64 { + t.Helper() + + metric := findRetentionMetric(t, rm, name) + sum, ok := metric.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected int64 sum metric data for %s", name) + for _, point := range sum.DataPoints { + if retentionMetricAttrsMatch(point.Attributes, wantAttrs) { + return point.Value + } + } + t.Fatalf("metric %s with attrs %v not found", name, wantAttrs) + return 0 +} + +func float64HistogramCount(t *testing.T, rm metricdata.ResourceMetrics, name string, wantAttrs map[string]string) uint64 { + t.Helper() + + metric := findRetentionMetric(t, rm, name) + histogram, ok := metric.Data.(metricdata.Histogram[float64]) + require.True(t, ok, "expected float64 histogram metric data for %s", name) + for _, point := range histogram.DataPoints { + if retentionMetricAttrsMatch(point.Attributes, wantAttrs) { + return point.Count + } + } + t.Fatalf("metric %s with attrs %v not found", name, wantAttrs) + return 0 +} + +func findRetentionMetric(t *testing.T, rm metricdata.ResourceMetrics, name string) metricdata.Metrics { + t.Helper() + + for _, scope := range rm.ScopeMetrics { + for _, metric := range scope.Metrics { + if metric.Name == name { + return metric + } + } + } + + t.Fatalf("metric %s not found", name) + return metricdata.Metrics{} +} + +func retentionMetricAttrsMatch(set attribute.Set, want map[string]string) bool { + if len(want) == 0 { + return true + } + + attrs := make(map[string]string, len(set.ToSlice())) + for _, kv := range set.ToSlice() { + attrs[string(kv.Key)] = kv.Value.AsString() + } + for key, value := range want { + if attrs[key] != value { + return false + } + } + return true +} + func stringsTrimDigest(digest string) string { return digest[len("sha256:"):] } diff --git a/lib/imageretention/metrics.go b/lib/imageretention/metrics.go new file mode 100644 index 00000000..7e59d822 --- /dev/null +++ b/lib/imageretention/metrics.go @@ -0,0 +1,121 @@ +package imageretention + +import ( + "context" + "time" + + hypotel "github.com/kernel/hypeman/lib/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the OTel instruments for image retention. +type Metrics struct { + sweepsTotal metric.Int64Counter + sweepDuration metric.Float64Histogram + deletesTotal metric.Int64Counter + staleReferencesTotal metric.Int64Counter + pendingImages metric.Int64ObservableGauge +} + +func newMetrics(meter metric.Meter, c *Controller) (*Metrics, error) { + sweepsTotal, err := meter.Int64Counter( + "hypeman_image_retention_sweeps_total", + metric.WithDescription("Total number of image retention sweeps"), + ) + if err != nil { + return nil, err + } + + sweepDuration, err := meter.Float64Histogram( + "hypeman_image_retention_sweep_duration_seconds", + metric.WithDescription("Duration of image retention sweeps"), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(hypotel.CommonDurationHistogramBuckets()...), + ) + if err != nil { + return nil, err + } + + deletesTotal, err := meter.Int64Counter( + "hypeman_image_retention_deletes_total", + metric.WithDescription("Total number of image retention delete attempts"), + ) + if err != nil { + return nil, err + } + + staleReferencesTotal, err := meter.Int64Counter( + "hypeman_image_retention_stale_references_total", + metric.WithDescription("Total number of stale image references skipped during retention sweeps"), + ) + if err != nil { + return nil, err + } + + pendingImages, err := meter.Int64ObservableGauge( + "hypeman_image_retention_pending_images", + metric.WithDescription("Number of images currently tracked by retention state"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + states, err := c.listStates() + if err != nil { + return nil + } + + now := c.now().UTC() + var tracked int64 + var expired int64 + for _, state := range states { + tracked++ + if !state.UnusedSince.IsZero() && !state.UnusedSince.UTC().Add(c.unusedFor).After(now) { + expired++ + } + } + + o.ObserveInt64(pendingImages, tracked, metric.WithAttributes(attribute.String("state", "tracked"))) + o.ObserveInt64(pendingImages, expired, metric.WithAttributes(attribute.String("state", "expired"))) + return nil + }, + pendingImages, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + sweepsTotal: sweepsTotal, + sweepDuration: sweepDuration, + deletesTotal: deletesTotal, + staleReferencesTotal: staleReferencesTotal, + pendingImages: pendingImages, + }, nil +} + +// RecordSweep records the outcome and duration of a retention sweep. +func (m *Metrics) RecordSweep(ctx context.Context, status string, duration time.Duration, staleReferences int) { + if m == nil { + return + } + + attrs := metric.WithAttributes(attribute.String("status", status)) + m.sweepsTotal.Add(ctx, 1, attrs) + m.sweepDuration.Record(ctx, duration.Seconds(), attrs) + if staleReferences > 0 { + m.staleReferencesTotal.Add(ctx, int64(staleReferences)) + } +} + +// RecordDelete records the outcome of an image delete attempt. +func (m *Metrics) RecordDelete(ctx context.Context, status string) { + if m == nil { + return + } + + m.deletesTotal.Add(ctx, 1, metric.WithAttributes(attribute.String("status", status))) +} diff --git a/lib/otel/README.md b/lib/otel/README.md index 47cd9123..38d79a7c 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -55,6 +55,11 @@ This keeps pull and push views aligned because both are sourced from the same OT | `hypeman_images_build_duration_seconds` | histogram | status | Image build time | | `hypeman_images_total` | gauge | status | Cached images count | | `hypeman_images_pulls_total` | counter | status | Registry pulls | +| `hypeman_image_retention_pending_images` | gauge | state | Images currently tracked by retention state | +| `hypeman_image_retention_stale_references_total` | counter | | Stale image references skipped during retention sweeps | +| `hypeman_image_retention_sweeps_total` | counter | status | Retention sweep runs | +| `hypeman_image_retention_sweep_duration_seconds` | histogram | status | Retention sweep latency | +| `hypeman_image_retention_deletes_total` | counter | status | Retention delete attempts | ### Instances | Metric | Type | Labels | Description | From 197c29848f9b4db9c7240e3f5a592d5e845530ea Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 27 Mar 2026 09:03:50 -0400 Subject: [PATCH 3/5] Address image retention review feedback --- lib/imageretention/controller.go | 37 ++++++++++++++++++++------------ lib/imageretention/metrics.go | 3 +-- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/lib/imageretention/controller.go b/lib/imageretention/controller.go index be62b591..dfb3621d 100644 --- a/lib/imageretention/controller.go +++ b/lib/imageretention/controller.go @@ -34,11 +34,7 @@ type digestRef struct { DigestHex string } -func (r digestRef) key() string { - return r.Repository + "@" + r.Digest -} - -func (r digestRef) digestRef() string { +func (r digestRef) String() string { return r.Repository + "@" + r.Digest } @@ -173,13 +169,13 @@ func (c *Controller) sweep(ctx context.Context) (sweepStats, error) { c.logger.WarnContext(ctx, "skipping image with invalid retention metadata", "image", img.Name, "digest", img.Digest, "error", err) continue } - allLocal[ref.key()] = struct{}{} + allLocal[ref.String()] = struct{}{} if img.Status == images.StatusReady { - ready[ref.key()] = ref + ready[ref.String()] = ref stats.readyImages++ continue } - nonReady[ref.key()] = ref + nonReady[ref.String()] = ref } states, err := c.listStates() @@ -263,7 +259,7 @@ func (c *Controller) sweep(ctx context.Context) (sweepStats, error) { continue } - if err := c.imageManager.DeleteImage(ctx, ref.digestRef()); err != nil { + if err := c.imageManager.DeleteImage(ctx, ref.String()); err != nil { if errors.Is(err, images.ErrNotFound) { if c.metrics != nil { c.metrics.RecordDelete(ctx, "not_found") @@ -280,13 +276,13 @@ func (c *Controller) sweep(ctx context.Context) (sweepStats, error) { if c.metrics != nil { c.metrics.RecordDelete(ctx, "error") } - return stats, fmt.Errorf("delete image %s: %w", ref.digestRef(), err) + return stats, fmt.Errorf("delete image %s: %w", ref.String(), err) } if c.metrics != nil { c.metrics.RecordDelete(ctx, "success") } - c.logger.InfoContext(ctx, "deleted unused cached image", "image", ref.digestRef(), "unused_since", state.UnusedSince, "unused_for", c.unusedFor) + c.logger.InfoContext(ctx, "deleted unused cached image", "image", ref.String(), "unused_since", state.UnusedSince, "unused_for", c.unusedFor) removed, err := c.deleteStateIfExists(ref) if err != nil { return stats, err @@ -320,7 +316,7 @@ func (c *Controller) collectProtectedDigests(ctx context.Context) (map[string]di c.logger.WarnContext(ctx, "skipping unresolvable retained image", "image", imageName, "digest", img.Digest, "error", err) continue } - protected[ref.key()] = ref + protected[ref.String()] = ref } return protected, staleRefs, nil } @@ -440,6 +436,9 @@ func (c *Controller) listStates() (map[string]imageState, error) { content, err := os.ReadFile(path) if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil + } return fmt.Errorf("read image retention state %s: %w", path, err) } @@ -448,8 +447,7 @@ func (c *Controller) listStates() (map[string]imageState, error) { return fmt.Errorf("unmarshal image retention state %s: %w", path, err) } - key := state.Repository + "@" + state.Digest - states[key] = state + states[state.Repository+"@"+state.Digest] = state return nil }) if err != nil && !errors.Is(err, os.ErrNotExist) { @@ -459,6 +457,17 @@ func (c *Controller) listStates() (map[string]imageState, error) { return states, nil } +func (c *Controller) listStatesSnapshot() (map[string]imageState, time.Time, error) { + c.mu.Lock() + defer c.mu.Unlock() + + states, err := c.listStates() + if err != nil { + return nil, time.Time{}, err + } + return states, c.now().UTC(), nil +} + func (c *Controller) writeState(state imageState) error { path := c.paths.ImageRetentionState(state.Repository, strings.TrimPrefix(state.Digest, "sha256:")) if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { diff --git a/lib/imageretention/metrics.go b/lib/imageretention/metrics.go index 7e59d822..d07016ab 100644 --- a/lib/imageretention/metrics.go +++ b/lib/imageretention/metrics.go @@ -63,12 +63,11 @@ func newMetrics(meter metric.Meter, c *Controller) (*Metrics, error) { _, err = meter.RegisterCallback( func(ctx context.Context, o metric.Observer) error { - states, err := c.listStates() + states, now, err := c.listStatesSnapshot() if err != nil { return nil } - now := c.now().UTC() var tracked int64 var expired int64 for _, state := range states { From 4830d1f9840e0903d2bf2ebf8e8c605de92530af Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 27 Mar 2026 09:42:50 -0400 Subject: [PATCH 4/5] Clean up image usage logging --- lib/instances/manager.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index f4e90cdf..ccffdad7 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -239,8 +239,8 @@ func (m *manager) recordImageUsage(ctx context.Context, imageInfo *images.Image) return } if err := m.imageUsageRecorder.MarkUsed(ctx, imageInfo.Name, imageInfo.Digest); err != nil { - logger := logger.FromContext(ctx) - logger.WarnContext(ctx, "failed to record image usage", "image", imageInfo.Name, "digest", imageInfo.Digest, "error", err) + log := logger.FromContext(ctx) + log.WarnContext(ctx, "failed to record image usage", "image", imageInfo.Name, "digest", imageInfo.Digest, "error", err) } } From 8879e48100e52487d5d6bb0eaa93ce51a748a3bf Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 27 Mar 2026 11:56:37 -0400 Subject: [PATCH 5/5] Add image retention allow list --- cmd/api/config/config.go | 9 ++- cmd/api/config/config_test.go | 24 ++++++++ cmd/api/main.go | 2 +- config.example.darwin.yaml | 3 + config.example.yaml | 3 + lib/imageretention/README.md | 6 +- lib/imageretention/controller.go | 51 +++++++++++++++- lib/imageretention/controller_test.go | 88 ++++++++++++++++++++++----- 8 files changed, 165 insertions(+), 21 deletions(-) diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 64a19a41..ce7c90a0 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -125,8 +125,9 @@ type LoggingConfig struct { // ImagesAutoDeleteConfig holds server-wide image retention settings. type ImagesAutoDeleteConfig struct { - Enabled bool `koanf:"enabled"` - UnusedFor string `koanf:"unused_for"` + Enabled bool `koanf:"enabled"` + UnusedFor string `koanf:"unused_for"` + Allowed []string `koanf:"allowed"` } // ImagesConfig holds image-management settings. @@ -335,6 +336,7 @@ func defaultConfig() *Config { AutoDelete: ImagesAutoDeleteConfig{ Enabled: false, UnusedFor: "720h", + Allowed: []string{}, }, }, @@ -533,6 +535,9 @@ func (c *Config) Validate() error { if err := validateDuration("images.auto_delete.unused_for", c.Images.AutoDelete.UnusedFor); err != nil { return err } + for i, pattern := range c.Images.AutoDelete.Allowed { + c.Images.AutoDelete.Allowed[i] = strings.TrimSpace(pattern) + } algorithm := strings.ToLower(c.Snapshot.CompressionDefault.Algorithm) c.Snapshot.CompressionDefault.Algorithm = algorithm if c.Snapshot.CompressionDefault.Enabled { diff --git a/cmd/api/config/config_test.go b/cmd/api/config/config_test.go index 3314e0bd..bb451957 100644 --- a/cmd/api/config/config_test.go +++ b/cmd/api/config/config_test.go @@ -37,6 +37,9 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) { if cfg.Images.AutoDelete.UnusedFor != "720h" { t.Fatalf("expected default images.auto_delete.unused_for to be 720h, got %q", cfg.Images.AutoDelete.UnusedFor) } + if len(cfg.Images.AutoDelete.Allowed) != 0 { + t.Fatalf("expected default images.auto_delete.allowed to be empty, got %v", cfg.Images.AutoDelete.Allowed) + } } func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) { @@ -162,6 +165,9 @@ func TestLoadUsesDefaultImageAutoDeleteRetentionWindow(t *testing.T) { if cfg.Images.AutoDelete.UnusedFor != "720h" { t.Fatalf("expected default images.auto_delete.unused_for to remain 720h, got %q", cfg.Images.AutoDelete.UnusedFor) } + if len(cfg.Images.AutoDelete.Allowed) != 0 { + t.Fatalf("expected default images.auto_delete.allowed to remain empty, got %v", cfg.Images.AutoDelete.Allowed) + } } func TestValidateRejectsInvalidImageAutoDeleteUnusedFor(t *testing.T) { @@ -174,6 +180,24 @@ func TestValidateRejectsInvalidImageAutoDeleteUnusedFor(t *testing.T) { } } +func TestValidateTrimsImageAutoDeleteAllowedPatterns(t *testing.T) { + cfg := defaultConfig() + cfg.Images.AutoDelete.Allowed = []string{" docker.io/library/* ", " ", "ghcr.io/kernel/*"} + + if err := cfg.Validate(); err != nil { + t.Fatalf("expected image auto delete allow list to validate, got %v", err) + } + if cfg.Images.AutoDelete.Allowed[0] != "docker.io/library/*" { + t.Fatalf("expected first allow pattern to be trimmed, got %q", cfg.Images.AutoDelete.Allowed[0]) + } + if cfg.Images.AutoDelete.Allowed[1] != "" { + t.Fatalf("expected whitespace-only allow pattern to trim to empty string, got %q", cfg.Images.AutoDelete.Allowed[1]) + } + if cfg.Images.AutoDelete.Allowed[2] != "ghcr.io/kernel/*" { + t.Fatalf("expected third allow pattern to be trimmed, got %q", cfg.Images.AutoDelete.Allowed[2]) + } +} + func TestValidateRejectsEmptyActiveBallooningDurations(t *testing.T) { cfg := defaultConfig() cfg.Hypervisor.Memory.ActiveBallooning.PollInterval = " " diff --git a/cmd/api/main.go b/cmd/api/main.go index 11657c58..3430b9fe 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -78,7 +78,7 @@ func configureImageRetentionController(cfg *config.Config, imageManager images.M return nil, fmt.Errorf("invalid images.auto_delete.unused_for %q: %w", cfg.Images.AutoDelete.UnusedFor, err) } - controller, err := imageretention.NewController(paths.New(cfg.DataDir), imageManager, unusedFor, logger, meter) + controller, err := imageretention.NewController(paths.New(cfg.DataDir), imageManager, unusedFor, cfg.Images.AutoDelete.Allowed, logger, meter) if err != nil { return nil, err } diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index 267bf8a1..5b16e672 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -74,6 +74,9 @@ logging: # auto_delete: # enabled: false # server-wide automatic deletion of cached converted images # unused_for: 720h # delete only after no instances or snapshots reference the image for this long +# allowed: # safety gate: only delete repositories matching one of these patterns +# - docker.io/library/* # match normalized repository names +# - ghcr.io/kernel/* # use ["*"] to allow deletion for every repository # # only affects data_dir/images, not the shared OCI cache # ============================================================================= diff --git a/config.example.yaml b/config.example.yaml index 0652efe6..34e59d14 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -67,6 +67,9 @@ data_dir: /var/lib/hypeman # auto_delete: # enabled: false # server-wide automatic deletion of cached converted images # unused_for: 720h # delete only after no instances or snapshots reference the image for this long +# allowed: # safety gate: only delete repositories matching one of these patterns +# - docker.io/library/* # match normalized repository names +# - ghcr.io/kernel/* # use ["*"] to allow deletion for every repository # # only affects data_dir/images, not the shared OCI cache # ============================================================================= diff --git a/lib/imageretention/README.md b/lib/imageretention/README.md index 25ee7aa7..c70db06f 100644 --- a/lib/imageretention/README.md +++ b/lib/imageretention/README.md @@ -9,6 +9,7 @@ images: auto_delete: enabled: false unused_for: 720h + allowed: [] ``` When auto-delete is enabled: @@ -16,9 +17,12 @@ When auto-delete is enabled: - The server runs a retention sweep on startup and then every minute. - Only converted cached images under `data_dir/images` are eligible for deletion. - Shared OCI cache data under `data_dir/system/oci-cache` is not modified. +- An image repository must also match at least one `allowed` pattern before any retention state is recorded or deletion is attempted. An image is considered in use if any persisted instance metadata or snapshot record still references it. As long as at least one such reference exists, the image is protected from deletion. -The retention timer starts only after the last persisted reference disappears. At that point the server records `unused_since` for the image digest. Once `unused_since + unused_for` has elapsed, the cached image digest is deleted. +The retention timer starts only after the last persisted reference disappears and the image repository is allowed by policy. At that point the server records `unused_since` for the image digest. Once `unused_since + unused_for` has elapsed, the cached image digest is deleted. + +Allow-list patterns match normalized repository names such as `docker.io/library/alpine`. A lone `*` pattern allows deletion for every repository. New instance creation clears any stale unused state for the resolved image before the new instance metadata is written. This helps prevent races where an image is being reused right as retention cleanup runs. diff --git a/lib/imageretention/controller.go b/lib/imageretention/controller.go index dfb3621d..d7df5cb6 100644 --- a/lib/imageretention/controller.go +++ b/lib/imageretention/controller.go @@ -9,6 +9,7 @@ import ( "log/slog" "os" "path/filepath" + "regexp" "strings" "sync" "time" @@ -43,6 +44,7 @@ type Controller struct { paths *paths.Paths imageManager images.Manager unusedFor time.Duration + allowed []string logger *slog.Logger metrics *Metrics now func() time.Time @@ -50,7 +52,7 @@ type Controller struct { } // NewController creates a new image retention controller. -func NewController(p *paths.Paths, imageManager images.Manager, unusedFor time.Duration, logger *slog.Logger, meter metric.Meter) (*Controller, error) { +func NewController(p *paths.Paths, imageManager images.Manager, unusedFor time.Duration, allowed []string, logger *slog.Logger, meter metric.Meter) (*Controller, error) { if logger == nil { logger = slog.Default() } @@ -58,6 +60,7 @@ func NewController(p *paths.Paths, imageManager images.Manager, unusedFor time.D paths: p, imageManager: imageManager, unusedFor: unusedFor, + allowed: normalizeAllowedPatterns(allowed), logger: logger.With("component", "image_retention"), now: time.Now, } @@ -232,6 +235,16 @@ func (c *Controller) sweep(ctx context.Context) (sweepStats, error) { if _, ok := protected[key]; ok { continue } + if !c.isAllowed(ref.Repository) { + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return stats, err + } + if removed { + stats.clearedStates++ + } + continue + } state, ok := states[key] if !ok { @@ -419,6 +432,42 @@ func normalizeDigest(digest string) string { return "sha256:" + digest } +func normalizeAllowedPatterns(patterns []string) []string { + normalized := make([]string, 0, len(patterns)) + for _, pattern := range patterns { + pattern = strings.TrimSpace(pattern) + if pattern == "" { + continue + } + normalized = append(normalized, pattern) + } + return normalized +} + +func (c *Controller) isAllowed(repository string) bool { + for _, pattern := range c.allowed { + if allowPatternMatches(pattern, repository) { + return true + } + } + return false +} + +func allowPatternMatches(pattern, repository string) bool { + pattern = strings.TrimSpace(pattern) + if pattern == "" { + return false + } + if pattern == "*" { + return true + } + + matcher := "^" + regexp.QuoteMeta(pattern) + "$" + matcher = strings.ReplaceAll(matcher, `\*`, ".*") + matcher = strings.ReplaceAll(matcher, `\?`, ".") + return regexp.MustCompile(matcher).MatchString(repository) +} + func (c *Controller) listStates() (map[string]imageState, error) { root := c.paths.SystemImageRetentionDir() states := make(map[string]imageState) diff --git a/lib/imageretention/controller_test.go b/lib/imageretention/controller_test.go index a8eb0dc8..d1f71fbb 100644 --- a/lib/imageretention/controller_test.go +++ b/lib/imageretention/controller_test.go @@ -21,7 +21,7 @@ import ( ) func TestSweepMarksUnreferencedImageUnused(t *testing.T) { - controller, p, _ := newTestController(t, 30*24*time.Hour) + controller, p, _ := newTestController(t, 30*24*time.Hour, []string{"*"}) const digest = "sha256:1111111111111111111111111111111111111111111111111111111111111111" seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) @@ -35,7 +35,7 @@ func TestSweepMarksUnreferencedImageUnused(t *testing.T) { } func TestSweepRecordsMetrics(t *testing.T) { - controller, p, reader := newMetricTestController(t, 24*time.Hour) + controller, p, reader := newMetricTestController(t, 24*time.Hour, []string{"*"}) const trackedDigest = "sha256:0101010101010101010101010101010101010101010101010101010101010101" const deletedDigest = "sha256:0202020202020202020202020202020202020202020202020202020202020202" @@ -65,7 +65,7 @@ func TestSweepRecordsMetrics(t *testing.T) { } func TestSweepReferencedInstancePreventsUnusedState(t *testing.T) { - controller, p, _ := newTestController(t, 30*24*time.Hour) + controller, p, _ := newTestController(t, 30*24*time.Hour, []string{"*"}) const digest = "sha256:2222222222222222222222222222222222222222222222222222222222222222" imageName := "docker.io/library/alpine:latest" seedReadyImage(t, p, imageName, digest) @@ -83,7 +83,7 @@ func TestSweepReferencedInstancePreventsUnusedState(t *testing.T) { } func TestSweepSnapshotReferencePreventsDeletion(t *testing.T) { - controller, p, _ := newTestController(t, 24*time.Hour) + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) const digest = "sha256:3333333333333333333333333333333333333333333333333333333333333333" imageName := "docker.io/library/alpine:latest" seedReadyImage(t, p, imageName, digest) @@ -103,7 +103,7 @@ func TestSweepSnapshotReferencePreventsDeletion(t *testing.T) { } func TestSweepDeletesExpiredImageDigestAndAllTags(t *testing.T) { - controller, p, _ := newTestController(t, 24*time.Hour) + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) const digest = "sha256:4444444444444444444444444444444444444444444444444444444444444444" seedReadyImage(t, p, "docker.io/library/alpine:latest", digest, "stable") writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) @@ -123,7 +123,7 @@ func TestSweepDeletesExpiredImageDigestAndAllTags(t *testing.T) { } func TestSweepProtectedImageClearsPriorState(t *testing.T) { - controller, p, _ := newTestController(t, 24*time.Hour) + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) const digest = "sha256:5555555555555555555555555555555555555555555555555555555555555555" imageName := "docker.io/library/alpine:latest" seedReadyImage(t, p, imageName, digest) @@ -141,7 +141,7 @@ func TestSweepProtectedImageClearsPriorState(t *testing.T) { } func TestSweepPrunesStateForManuallyDeletedImage(t *testing.T) { - controller, p, _ := newTestController(t, 24*time.Hour) + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) const digest = "sha256:6666666666666666666666666666666666666666666666666666666666666666" seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) @@ -154,7 +154,7 @@ func TestSweepPrunesStateForManuallyDeletedImage(t *testing.T) { } func TestSweepIgnoresNonReadyImages(t *testing.T) { - controller, p, _ := newTestController(t, 24*time.Hour) + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) const digest = "sha256:7777777777777777777777777777777777777777777777777777777777777777" seedImage(t, p, "docker.io/library/alpine:latest", digest, images.StatusPending) @@ -165,7 +165,7 @@ func TestSweepIgnoresNonReadyImages(t *testing.T) { } func TestSweepStaleReferencesDoNotBlockOtherCleanup(t *testing.T) { - controller, p, _ := newTestController(t, 24*time.Hour) + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) const staleDigest = "sha256:8888888888888888888888888888888888888888888888888888888888888888" const liveDigest = "sha256:9999999999999999999999999999999999999999999999999999999999999999" seedReadyImage(t, p, "docker.io/library/alpine:latest", liveDigest) @@ -187,7 +187,7 @@ func TestSweepStaleReferencesDoNotBlockOtherCleanup(t *testing.T) { } func TestMarkUsedClearsState(t *testing.T) { - controller, p, _ := newTestController(t, 24*time.Hour) + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) const digest = "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) @@ -199,7 +199,7 @@ func TestMarkUsedClearsState(t *testing.T) { } func TestRunPerformsImmediateSweep(t *testing.T) { - controller, p, _ := newTestController(t, 24*time.Hour) + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) const digest = "sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) now := time.Date(2026, 3, 26, 12, 0, 0, 0, time.UTC) @@ -233,7 +233,7 @@ func TestSweepSuccessLogIsDebugOnly(t *testing.T) { var out bytes.Buffer logger := slog.New(slog.NewTextHandler(&out, &slog.HandlerOptions{Level: slog.LevelInfo})) - controller, err := NewController(p, manager, 24*time.Hour, logger, nil) + controller, err := NewController(p, manager, 24*time.Hour, []string{"*"}, logger, nil) require.NoError(t, err) seedReadyImage(t, p, "docker.io/library/alpine:latest", "sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc") @@ -241,7 +241,63 @@ func TestSweepSuccessLogIsDebugOnly(t *testing.T) { require.NotContains(t, out.String(), "image auto-delete sweep completed") } -func newTestController(t *testing.T, unusedFor time.Duration) (*Controller, *paths.Paths, images.Manager) { +func TestSweepSkipsDisallowedRepositories(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"ghcr.io/kernel/*"}) + const digest = "sha256:dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest))) + require.NoError(t, err) + _, err = os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepAllowsGlobalWildcard(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const digest = "sha256:eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestAllowPatternMatches(t *testing.T) { + tests := []struct { + name string + pattern string + repository string + want bool + }{ + {name: "exact", pattern: "docker.io/library/alpine", repository: "docker.io/library/alpine", want: true}, + {name: "global wildcard", pattern: "*", repository: "docker.io/library/alpine", want: true}, + {name: "suffix wildcard", pattern: "docker.io/library/*", repository: "docker.io/library/alpine", want: true}, + {name: "embedded wildcard", pattern: "ghcr.io/kernel/hypeman-*", repository: "ghcr.io/kernel/hypeman-test", want: true}, + {name: "question mark", pattern: "ghcr.io/kernel/hypeman-?", repository: "ghcr.io/kernel/hypeman-a", want: true}, + {name: "no match", pattern: "ghcr.io/kernel/*", repository: "docker.io/library/alpine", want: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.want, allowPatternMatches(tt.pattern, tt.repository)) + }) + } +} + +func newTestController(t *testing.T, unusedFor time.Duration, allowed []string) (*Controller, *paths.Paths, images.Manager) { t.Helper() dataDir := t.TempDir() @@ -249,12 +305,12 @@ func newTestController(t *testing.T, unusedFor time.Duration) (*Controller, *pat manager, err := images.NewManager(p, 1, nil) require.NoError(t, err) - controller, err := NewController(p, manager, unusedFor, nil, nil) + controller, err := NewController(p, manager, unusedFor, allowed, nil, nil) require.NoError(t, err) return controller, p, manager } -func newMetricTestController(t *testing.T, unusedFor time.Duration) (*Controller, *paths.Paths, *otelmetric.ManualReader) { +func newMetricTestController(t *testing.T, unusedFor time.Duration, allowed []string) (*Controller, *paths.Paths, *otelmetric.ManualReader) { t.Helper() dataDir := t.TempDir() @@ -264,7 +320,7 @@ func newMetricTestController(t *testing.T, unusedFor time.Duration) (*Controller reader := otelmetric.NewManualReader() provider := otelmetric.NewMeterProvider(otelmetric.WithReader(reader)) - controller, err := NewController(p, manager, unusedFor, nil, provider.Meter("test")) + controller, err := NewController(p, manager, unusedFor, allowed, nil, provider.Meter("test")) require.NoError(t, err) return controller, p, reader }