diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 4e0bdadf..ce7c90a0 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -123,6 +123,18 @@ type LoggingConfig struct { RotateInterval string `koanf:"rotate_interval"` } +// ImagesAutoDeleteConfig holds server-wide image retention settings. +type ImagesAutoDeleteConfig struct { + Enabled bool `koanf:"enabled"` + UnusedFor string `koanf:"unused_for"` + Allowed []string `koanf:"allowed"` +} + +// ImagesConfig holds image-management settings. +type ImagesConfig struct { + AutoDelete ImagesAutoDeleteConfig `koanf:"auto_delete"` +} + // BuildConfig holds source-to-image build system settings. type BuildConfig struct { MaxConcurrentSourceBuilds int `koanf:"max_concurrent_source_builds"` @@ -226,6 +238,7 @@ type Config struct { Metrics MetricsConfig `koanf:"metrics"` Otel OtelConfig `koanf:"otel"` Logging LoggingConfig `koanf:"logging"` + Images ImagesConfig `koanf:"images"` Build BuildConfig `koanf:"build"` Registry RegistryConfig `koanf:"registry"` Limits LimitsConfig `koanf:"limits"` @@ -319,6 +332,14 @@ func defaultConfig() *Config { RotateInterval: "5m", }, + Images: ImagesConfig{ + AutoDelete: ImagesAutoDeleteConfig{ + Enabled: false, + UnusedFor: "720h", + Allowed: []string{}, + }, + }, + Build: BuildConfig{ MaxConcurrentSourceBuilds: 2, BuilderImage: "", // empty = build from embedded Dockerfile on first run @@ -511,6 +532,12 @@ func (c *Config) Validate() error { if c.Build.Timeout <= 0 { return fmt.Errorf("build.timeout must be positive, got %d", c.Build.Timeout) } + if err := validateDuration("images.auto_delete.unused_for", c.Images.AutoDelete.UnusedFor); err != nil { + return err + } + for i, pattern := range c.Images.AutoDelete.Allowed { + c.Images.AutoDelete.Allowed[i] = strings.TrimSpace(pattern) + } algorithm := strings.ToLower(c.Snapshot.CompressionDefault.Algorithm) c.Snapshot.CompressionDefault.Algorithm = algorithm if c.Snapshot.CompressionDefault.Enabled { diff --git a/cmd/api/config/config_test.go b/cmd/api/config/config_test.go index 6ae361a1..bb451957 100644 --- a/cmd/api/config/config_test.go +++ b/cmd/api/config/config_test.go @@ -31,6 +31,15 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) { if cfg.Otel.SuccessfulGetSampleRatio != 0.1 { t.Fatalf("expected default otel.successful_get_sample_ratio to be 0.1, got %v", cfg.Otel.SuccessfulGetSampleRatio) } + if cfg.Images.AutoDelete.Enabled { + t.Fatalf("expected default images.auto_delete.enabled to be false") + } + if cfg.Images.AutoDelete.UnusedFor != "720h" { + t.Fatalf("expected default images.auto_delete.unused_for to be 720h, got %q", cfg.Images.AutoDelete.UnusedFor) + } + if len(cfg.Images.AutoDelete.Allowed) != 0 { + t.Fatalf("expected default images.auto_delete.allowed to be empty, got %v", cfg.Images.AutoDelete.Allowed) + } } func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) { @@ -138,6 +147,57 @@ func TestValidateRejectsInvalidResourceRefreshInterval(t *testing.T) { } } +func TestLoadUsesDefaultImageAutoDeleteRetentionWindow(t *testing.T) { + tmp := t.TempDir() + cfgPath := filepath.Join(tmp, "config.yaml") + if err := os.WriteFile(cfgPath, []byte("images:\n auto_delete:\n enabled: true\n"), 0600); err != nil { + t.Fatalf("write temp config: %v", err) + } + + cfg, err := Load(cfgPath) + if err != nil { + t.Fatalf("load config: %v", err) + } + + if !cfg.Images.AutoDelete.Enabled { + t.Fatalf("expected images.auto_delete.enabled override to be true") + } + if cfg.Images.AutoDelete.UnusedFor != "720h" { + t.Fatalf("expected default images.auto_delete.unused_for to remain 720h, got %q", cfg.Images.AutoDelete.UnusedFor) + } + if len(cfg.Images.AutoDelete.Allowed) != 0 { + t.Fatalf("expected default images.auto_delete.allowed to remain empty, got %v", cfg.Images.AutoDelete.Allowed) + } +} + +func TestValidateRejectsInvalidImageAutoDeleteUnusedFor(t *testing.T) { + cfg := defaultConfig() + cfg.Images.AutoDelete.UnusedFor = "not-a-duration" + + err := cfg.Validate() + if err == nil { + t.Fatalf("expected validation error for invalid images.auto_delete.unused_for") + } +} + +func TestValidateTrimsImageAutoDeleteAllowedPatterns(t *testing.T) { + cfg := defaultConfig() + cfg.Images.AutoDelete.Allowed = []string{" docker.io/library/* ", " ", "ghcr.io/kernel/*"} + + if err := cfg.Validate(); err != nil { + t.Fatalf("expected image auto delete allow list to validate, got %v", err) + } + if cfg.Images.AutoDelete.Allowed[0] != "docker.io/library/*" { + t.Fatalf("expected first allow pattern to be trimmed, got %q", cfg.Images.AutoDelete.Allowed[0]) + } + if cfg.Images.AutoDelete.Allowed[1] != "" { + t.Fatalf("expected whitespace-only allow pattern to trim to empty string, got %q", cfg.Images.AutoDelete.Allowed[1]) + } + if cfg.Images.AutoDelete.Allowed[2] != "ghcr.io/kernel/*" { + t.Fatalf("expected third allow pattern to be trimmed, got %q", cfg.Images.AutoDelete.Allowed[2]) + } +} + func TestValidateRejectsEmptyActiveBallooningDurations(t *testing.T) { cfg := defaultConfig() cfg.Hypervisor.Memory.ActiveBallooning.PollInterval = " " diff --git a/cmd/api/image_retention_test.go b/cmd/api/image_retention_test.go new file mode 100644 index 00000000..60299cea --- /dev/null +++ b/cmd/api/image_retention_test.go @@ -0,0 +1,54 @@ +package main + +import ( + "context" + "sync/atomic" + "testing" + "time" + + "golang.org/x/sync/errgroup" +) + +type stubImageRetentionRunner struct { + runCount atomic.Int32 +} + +func (s *stubImageRetentionRunner) Run(ctx context.Context) error { + s.runCount.Add(1) + <-ctx.Done() + return nil +} + +func TestStartImageRetentionControllerSkipsNilController(t *testing.T) { + grp, ctx := errgroup.WithContext(context.Background()) + + started := startImageRetentionController(grp, ctx, nil) + if started { + t.Fatalf("expected nil controller not to start") + } +} + +func TestStartImageRetentionControllerStartsRunner(t *testing.T) { + grp, ctx := errgroup.WithContext(context.Background()) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + runner := &stubImageRetentionRunner{} + started := startImageRetentionController(grp, ctx, runner) + if !started { + t.Fatalf("expected controller to start") + } + + deadline := time.Now().Add(time.Second) + for runner.runCount.Load() == 0 && time.Now().Before(deadline) { + time.Sleep(10 * time.Millisecond) + } + if runner.runCount.Load() != 1 { + t.Fatalf("expected runner to be started once, got %d", runner.runCount.Load()) + } + + cancel() + if err := grp.Wait(); err != nil { + t.Fatalf("wait for retention runner: %v", err) + } +} diff --git a/cmd/api/main.go b/cmd/api/main.go index 01b17f57..3430b9fe 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -26,15 +26,20 @@ import ( "github.com/kernel/hypeman/lib/devices" "github.com/kernel/hypeman/lib/guest" "github.com/kernel/hypeman/lib/hypervisor/qemu" + "github.com/kernel/hypeman/lib/imageretention" + "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/instances" + loglib "github.com/kernel/hypeman/lib/logger" mw "github.com/kernel/hypeman/lib/middleware" "github.com/kernel/hypeman/lib/oapi" "github.com/kernel/hypeman/lib/otel" + "github.com/kernel/hypeman/lib/paths" "github.com/kernel/hypeman/lib/registry" "github.com/kernel/hypeman/lib/scopes" "github.com/kernel/hypeman/lib/vmm" nethttpmiddleware "github.com/oapi-codegen/nethttp-middleware" "github.com/riandyrn/otelchi" + "go.opentelemetry.io/otel/metric" "golang.org/x/sync/errgroup" ) @@ -59,6 +64,40 @@ func newMetricsServer(addr string, handler http.Handler) *http.Server { } } +type imageRetentionRunner interface { + Run(ctx context.Context) error +} + +func configureImageRetentionController(cfg *config.Config, imageManager images.Manager, instanceManager instances.Manager, logger *slog.Logger, meter metric.Meter) (imageRetentionRunner, error) { + if cfg == nil || !cfg.Images.AutoDelete.Enabled { + return nil, nil + } + + unusedFor, err := time.ParseDuration(cfg.Images.AutoDelete.UnusedFor) + if err != nil { + return nil, fmt.Errorf("invalid images.auto_delete.unused_for %q: %w", cfg.Images.AutoDelete.UnusedFor, err) + } + + controller, err := imageretention.NewController(paths.New(cfg.DataDir), imageManager, unusedFor, cfg.Images.AutoDelete.Allowed, logger, meter) + if err != nil { + return nil, err + } + if setter, ok := instanceManager.(instances.ImageUsageRecorderSetter); ok { + setter.SetImageUsageRecorder(controller) + } + return controller, nil +} + +func startImageRetentionController(grp *errgroup.Group, ctx context.Context, controller imageRetentionRunner) bool { + if grp == nil || controller == nil { + return false + } + grp.Go(func() error { + return controller.Run(ctx) + }) + return true +} + func run() error { // Load config early for OTel initialization // Config path can be specified via CONFIG_PATH env var or defaults to platform-specific locations @@ -429,6 +468,20 @@ func run() error { // Error group for coordinated shutdown grp, gctx := errgroup.WithContext(ctx) + retentionController, err := configureImageRetentionController( + app.Config, + app.ImageManager, + app.InstanceManager, + logger, + otelProvider.MeterFor(loglib.SubsystemImages), + ) + if err != nil { + return err + } + if startImageRetentionController(grp, gctx, retentionController) { + logger.Info("image auto-delete enabled", "unused_for", app.Config.Images.AutoDelete.UnusedFor) + } + // Start build manager background services (vsock handler for builder VMs) if err := app.BuildManager.Start(gctx); err != nil { logger.Error("failed to start build manager", "error", err) diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index 0ace51dc..5b16e672 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -67,6 +67,18 @@ network: logging: level: debug +# ============================================================================= +# Images +# ============================================================================= +# images: +# auto_delete: +# enabled: false # server-wide automatic deletion of cached converted images +# unused_for: 720h # delete only after no instances or snapshots reference the image for this long +# allowed: # safety gate: only delete repositories matching one of these patterns +# - docker.io/library/* # match normalized repository names +# - ghcr.io/kernel/* # use ["*"] to allow deletion for every repository +# # only affects data_dir/images, not the shared OCI cache + # ============================================================================= # Caddy / Ingress Configuration # ============================================================================= diff --git a/config.example.yaml b/config.example.yaml index 345c2da9..34e59d14 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -60,6 +60,18 @@ data_dir: /var/lib/hypeman # logging: # level: info # debug, info, warn, error +# ============================================================================= +# Images +# ============================================================================= +# images: +# auto_delete: +# enabled: false # server-wide automatic deletion of cached converted images +# unused_for: 720h # delete only after no instances or snapshots reference the image for this long +# allowed: # safety gate: only delete repositories matching one of these patterns +# - docker.io/library/* # match normalized repository names +# - ghcr.io/kernel/* # use ["*"] to allow deletion for every repository +# # only affects data_dir/images, not the shared OCI cache + # ============================================================================= # Caddy / Ingress Configuration # ============================================================================= diff --git a/lib/imageretention/README.md b/lib/imageretention/README.md new file mode 100644 index 00000000..c70db06f --- /dev/null +++ b/lib/imageretention/README.md @@ -0,0 +1,28 @@ +# Image Retention + +This feature automatically deletes cached converted images after they have been unused for a configurable amount of time. + +The retention window is server-wide and controlled by: + +```yaml +images: + auto_delete: + enabled: false + unused_for: 720h + allowed: [] +``` + +When auto-delete is enabled: + +- The server runs a retention sweep on startup and then every minute. +- Only converted cached images under `data_dir/images` are eligible for deletion. +- Shared OCI cache data under `data_dir/system/oci-cache` is not modified. +- An image repository must also match at least one `allowed` pattern before any retention state is recorded or deletion is attempted. + +An image is considered in use if any persisted instance metadata or snapshot record still references it. As long as at least one such reference exists, the image is protected from deletion. + +The retention timer starts only after the last persisted reference disappears and the image repository is allowed by policy. At that point the server records `unused_since` for the image digest. Once `unused_since + unused_for` has elapsed, the cached image digest is deleted. + +Allow-list patterns match normalized repository names such as `docker.io/library/alpine`. A lone `*` pattern allows deletion for every repository. + +New instance creation clears any stale unused state for the resolved image before the new instance metadata is written. This helps prevent races where an image is being reused right as retention cleanup runs. diff --git a/lib/imageretention/controller.go b/lib/imageretention/controller.go new file mode 100644 index 00000000..d7df5cb6 --- /dev/null +++ b/lib/imageretention/controller.go @@ -0,0 +1,567 @@ +package imageretention + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io/fs" + "log/slog" + "os" + "path/filepath" + "regexp" + "strings" + "sync" + "time" + + "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/instances" + "github.com/kernel/hypeman/lib/paths" + snapshotstore "github.com/kernel/hypeman/lib/snapshot" + "go.opentelemetry.io/otel/metric" +) + +const sweepInterval = time.Minute + +type imageState struct { + Repository string `json:"repository"` + Digest string `json:"digest"` + UnusedSince time.Time `json:"unused_since"` +} + +type digestRef struct { + Repository string + Digest string + DigestHex string +} + +func (r digestRef) String() string { + return r.Repository + "@" + r.Digest +} + +// Controller tracks unused cached images and deletes them after a retention window. +type Controller struct { + paths *paths.Paths + imageManager images.Manager + unusedFor time.Duration + allowed []string + logger *slog.Logger + metrics *Metrics + now func() time.Time + mu sync.Mutex +} + +// NewController creates a new image retention controller. +func NewController(p *paths.Paths, imageManager images.Manager, unusedFor time.Duration, allowed []string, logger *slog.Logger, meter metric.Meter) (*Controller, error) { + if logger == nil { + logger = slog.Default() + } + controller := &Controller{ + paths: p, + imageManager: imageManager, + unusedFor: unusedFor, + allowed: normalizeAllowedPatterns(allowed), + logger: logger.With("component", "image_retention"), + now: time.Now, + } + if meter != nil { + metrics, err := newMetrics(meter, controller) + if err != nil { + return nil, fmt.Errorf("create image retention metrics: %w", err) + } + controller.metrics = metrics + } + return controller, nil +} + +// Run executes one sweep immediately, then continues sweeping every minute until ctx is cancelled. +func (c *Controller) Run(ctx context.Context) error { + c.logger.InfoContext(ctx, "image auto-delete scheduler started", "unused_for", c.unusedFor, "interval", sweepInterval) + if err := c.Sweep(ctx); err != nil { + c.logger.ErrorContext(ctx, "image auto-delete sweep failed", "error", err) + } + + ticker := time.NewTicker(sweepInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + if err := c.Sweep(ctx); err != nil { + c.logger.ErrorContext(ctx, "image auto-delete sweep failed", "error", err) + } + } + } +} + +// Sweep performs one reconciliation pass of image retention state. +func (c *Controller) Sweep(ctx context.Context) error { + c.mu.Lock() + defer c.mu.Unlock() + + start := time.Now() + stats, err := c.sweep(ctx) + status := "success" + if err != nil { + status = "error" + } + if c.metrics != nil { + c.metrics.RecordSweep(ctx, status, time.Since(start), stats.staleReferences) + } + if err != nil { + return err + } + + c.logger.DebugContext(ctx, "image auto-delete sweep completed", + "ready_images", stats.readyImages, + "protected_images", stats.protectedImages, + "marked_unused", stats.markedUnused, + "cleared_states", stats.clearedStates, + "deleted_images", stats.deletedImages, + "pruned_states", stats.prunedStates, + "stale_references", stats.staleReferences, + ) + return nil +} + +// MarkUsed clears retention state for a digest that is about to be referenced by a new instance. +func (c *Controller) MarkUsed(ctx context.Context, imageName, digest string) error { + c.mu.Lock() + defer c.mu.Unlock() + + ref, err := c.resolveDigestRef(ctx, imageName, digest) + if err != nil { + return err + } + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return err + } + if removed { + c.logger.DebugContext(ctx, "cleared image auto-delete state", "image", imageName, "digest", ref.Digest) + } + return nil +} + +type sweepStats struct { + readyImages int + protectedImages int + markedUnused int + clearedStates int + deletedImages int + prunedStates int + staleReferences int +} + +func (c *Controller) sweep(ctx context.Context) (sweepStats, error) { + var stats sweepStats + + allImages, err := c.imageManager.ListImages(ctx) + if err != nil { + return stats, fmt.Errorf("list images: %w", err) + } + + ready := make(map[string]digestRef) + allLocal := make(map[string]struct{}) + nonReady := make(map[string]digestRef) + for _, img := range allImages { + ref, err := digestRefFromImage(img) + if err != nil { + c.logger.WarnContext(ctx, "skipping image with invalid retention metadata", "image", img.Name, "digest", img.Digest, "error", err) + continue + } + allLocal[ref.String()] = struct{}{} + if img.Status == images.StatusReady { + ready[ref.String()] = ref + stats.readyImages++ + continue + } + nonReady[ref.String()] = ref + } + + states, err := c.listStates() + if err != nil { + return stats, err + } + + protected, staleRefs, err := c.collectProtectedDigests(ctx) + if err != nil { + return stats, err + } + stats.protectedImages = len(protected) + stats.staleReferences = staleRefs + + for _, ref := range protected { + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return stats, err + } + if removed { + stats.clearedStates++ + } + } + + for _, ref := range nonReady { + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return stats, err + } + if removed { + stats.clearedStates++ + } + } + + for key, state := range states { + if _, ok := allLocal[key]; ok { + continue + } + removed, err := c.deleteStateIfExists(digestRef{ + Repository: state.Repository, + Digest: state.Digest, + DigestHex: strings.TrimPrefix(state.Digest, "sha256:"), + }) + if err != nil { + return stats, err + } + if removed { + stats.prunedStates++ + } + } + + now := c.now().UTC() + for key, ref := range ready { + if _, ok := protected[key]; ok { + continue + } + if !c.isAllowed(ref.Repository) { + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return stats, err + } + if removed { + stats.clearedStates++ + } + continue + } + + state, ok := states[key] + if !ok { + if err := c.writeState(imageState{ + Repository: ref.Repository, + Digest: ref.Digest, + UnusedSince: now, + }); err != nil { + return stats, err + } + stats.markedUnused++ + continue + } + + if state.UnusedSince.IsZero() { + state.UnusedSince = now + if err := c.writeState(state); err != nil { + return stats, err + } + stats.markedUnused++ + continue + } + + if state.UnusedSince.UTC().Add(c.unusedFor).After(now) { + continue + } + + if err := c.imageManager.DeleteImage(ctx, ref.String()); err != nil { + if errors.Is(err, images.ErrNotFound) { + if c.metrics != nil { + c.metrics.RecordDelete(ctx, "not_found") + } + removed, removeErr := c.deleteStateIfExists(ref) + if removeErr != nil { + return stats, removeErr + } + if removed { + stats.prunedStates++ + } + continue + } + if c.metrics != nil { + c.metrics.RecordDelete(ctx, "error") + } + return stats, fmt.Errorf("delete image %s: %w", ref.String(), err) + } + if c.metrics != nil { + c.metrics.RecordDelete(ctx, "success") + } + + c.logger.InfoContext(ctx, "deleted unused cached image", "image", ref.String(), "unused_since", state.UnusedSince, "unused_for", c.unusedFor) + removed, err := c.deleteStateIfExists(ref) + if err != nil { + return stats, err + } + if removed { + stats.deletedImages++ + } + } + + return stats, nil +} + +func (c *Controller) collectProtectedDigests(ctx context.Context) (map[string]digestRef, int, error) { + imageRefs, err := c.collectReferencedImageNames(ctx) + if err != nil { + return nil, 0, err + } + + protected := make(map[string]digestRef, len(imageRefs)) + staleRefs := 0 + for imageName := range imageRefs { + img, err := c.imageManager.GetImage(ctx, imageName) + if err != nil { + staleRefs++ + c.logger.WarnContext(ctx, "skipping stale image reference during retention sweep", "image", imageName, "error", err) + continue + } + ref, err := digestRefFromImage(*img) + if err != nil { + staleRefs++ + c.logger.WarnContext(ctx, "skipping unresolvable retained image", "image", imageName, "digest", img.Digest, "error", err) + continue + } + protected[ref.String()] = ref + } + return protected, staleRefs, nil +} + +func (c *Controller) collectReferencedImageNames(ctx context.Context) (map[string]struct{}, error) { + _ = ctx + + refs := make(map[string]struct{}) + guestDir := c.paths.GuestsDir() + if entries, err := os.ReadDir(guestDir); err == nil { + for _, entry := range entries { + if !entry.IsDir() { + continue + } + metaPath := c.paths.InstanceMetadata(entry.Name()) + content, err := os.ReadFile(metaPath) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + continue + } + return nil, fmt.Errorf("read instance metadata %s: %w", metaPath, err) + } + + var stored instances.StoredMetadata + if err := json.Unmarshal(content, &stored); err != nil { + c.logger.Warn("skipping invalid instance metadata during retention sweep", "path", metaPath, "error", err) + continue + } + if stored.Image != "" { + refs[stored.Image] = struct{}{} + } + } + } else if !errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("read guests directory: %w", err) + } + + store := snapshotstore.NewStore(c.paths) + records, err := store.ListRecords() + if err != nil { + return nil, fmt.Errorf("list snapshot records: %w", err) + } + for _, record := range records { + if len(record.StoredMetadata) == 0 { + continue + } + var stored instances.StoredMetadata + if err := json.Unmarshal(record.StoredMetadata, &stored); err != nil { + c.logger.Warn("skipping invalid snapshot stored metadata during retention sweep", "snapshot_id", record.Snapshot.Id, "error", err) + continue + } + if stored.Image != "" { + refs[stored.Image] = struct{}{} + } + } + + return refs, nil +} + +func (c *Controller) resolveDigestRef(ctx context.Context, imageName, digest string) (digestRef, error) { + if strings.TrimSpace(digest) == "" { + img, err := c.imageManager.GetImage(ctx, imageName) + if err != nil { + return digestRef{}, err + } + return digestRefFromImage(*img) + } + + ref, err := images.ParseNormalizedRef(imageName) + if err != nil { + return digestRef{}, fmt.Errorf("parse image name: %w", err) + } + return digestRef{ + Repository: ref.Repository(), + Digest: normalizeDigest(digest), + DigestHex: strings.TrimPrefix(normalizeDigest(digest), "sha256:"), + }, nil +} + +func digestRefFromImage(img images.Image) (digestRef, error) { + if strings.TrimSpace(img.Digest) == "" { + return digestRef{}, fmt.Errorf("missing digest") + } + ref, err := images.ParseNormalizedRef(img.Name) + if err != nil { + return digestRef{}, err + } + digest := normalizeDigest(img.Digest) + return digestRef{ + Repository: ref.Repository(), + Digest: digest, + DigestHex: strings.TrimPrefix(digest, "sha256:"), + }, nil +} + +func normalizeDigest(digest string) string { + digest = strings.TrimSpace(digest) + if strings.HasPrefix(digest, "sha256:") { + return digest + } + return "sha256:" + digest +} + +func normalizeAllowedPatterns(patterns []string) []string { + normalized := make([]string, 0, len(patterns)) + for _, pattern := range patterns { + pattern = strings.TrimSpace(pattern) + if pattern == "" { + continue + } + normalized = append(normalized, pattern) + } + return normalized +} + +func (c *Controller) isAllowed(repository string) bool { + for _, pattern := range c.allowed { + if allowPatternMatches(pattern, repository) { + return true + } + } + return false +} + +func allowPatternMatches(pattern, repository string) bool { + pattern = strings.TrimSpace(pattern) + if pattern == "" { + return false + } + if pattern == "*" { + return true + } + + matcher := "^" + regexp.QuoteMeta(pattern) + "$" + matcher = strings.ReplaceAll(matcher, `\*`, ".*") + matcher = strings.ReplaceAll(matcher, `\?`, ".") + return regexp.MustCompile(matcher).MatchString(repository) +} + +func (c *Controller) listStates() (map[string]imageState, error) { + root := c.paths.SystemImageRetentionDir() + states := make(map[string]imageState) + + err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error { + if walkErr != nil { + if errors.Is(walkErr, os.ErrNotExist) { + return nil + } + return walkErr + } + if d.IsDir() || filepath.Ext(path) != ".json" { + return nil + } + + content, err := os.ReadFile(path) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil + } + return fmt.Errorf("read image retention state %s: %w", path, err) + } + + var state imageState + if err := json.Unmarshal(content, &state); err != nil { + return fmt.Errorf("unmarshal image retention state %s: %w", path, err) + } + + states[state.Repository+"@"+state.Digest] = state + return nil + }) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("list image retention state: %w", err) + } + + return states, nil +} + +func (c *Controller) listStatesSnapshot() (map[string]imageState, time.Time, error) { + c.mu.Lock() + defer c.mu.Unlock() + + states, err := c.listStates() + if err != nil { + return nil, time.Time{}, err + } + return states, c.now().UTC(), nil +} + +func (c *Controller) writeState(state imageState) error { + path := c.paths.ImageRetentionState(state.Repository, strings.TrimPrefix(state.Digest, "sha256:")) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return fmt.Errorf("create image retention directory: %w", err) + } + + content, err := json.MarshalIndent(state, "", " ") + if err != nil { + return fmt.Errorf("marshal image retention state: %w", err) + } + + tmpPath := path + ".tmp" + if err := os.WriteFile(tmpPath, content, 0o644); err != nil { + return fmt.Errorf("write image retention temp state: %w", err) + } + if err := os.Rename(tmpPath, path); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("rename image retention state: %w", err) + } + return nil +} + +func (c *Controller) deleteStateIfExists(ref digestRef) (bool, error) { + path := c.paths.ImageRetentionState(ref.Repository, ref.DigestHex) + if err := os.Remove(path); err != nil { + if errors.Is(err, os.ErrNotExist) { + return false, nil + } + return false, fmt.Errorf("remove image retention state %s: %w", path, err) + } + c.removeEmptyParents(filepath.Dir(path), c.paths.SystemImageRetentionDir()) + return true, nil +} + +func (c *Controller) removeEmptyParents(path string, root string) { + root = filepath.Clean(root) + for { + cleaned := filepath.Clean(path) + if cleaned == root || cleaned == "." || cleaned == string(filepath.Separator) { + return + } + err := os.Remove(cleaned) + if err != nil { + return + } + path = filepath.Dir(cleaned) + } +} diff --git a/lib/imageretention/controller_test.go b/lib/imageretention/controller_test.go new file mode 100644 index 00000000..d1f71fbb --- /dev/null +++ b/lib/imageretention/controller_test.go @@ -0,0 +1,499 @@ +package imageretention + +import ( + "bytes" + "context" + "encoding/json" + "log/slog" + "os" + "path/filepath" + "testing" + "time" + + "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/instances" + "github.com/kernel/hypeman/lib/paths" + snapshotstore "github.com/kernel/hypeman/lib/snapshot" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/attribute" + otelmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" +) + +func TestSweepMarksUnreferencedImageUnused(t *testing.T) { + controller, p, _ := newTestController(t, 30*24*time.Hour, []string{"*"}) + const digest = "sha256:1111111111111111111111111111111111111111111111111111111111111111" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + + now := time.Date(2026, 3, 26, 12, 0, 0, 0, time.UTC) + controller.now = func() time.Time { return now } + + require.NoError(t, controller.Sweep(context.Background())) + + state := readState(t, p, "docker.io/library/alpine", digest) + require.Equal(t, now, state.UnusedSince) +} + +func TestSweepRecordsMetrics(t *testing.T) { + controller, p, reader := newMetricTestController(t, 24*time.Hour, []string{"*"}) + const trackedDigest = "sha256:0101010101010101010101010101010101010101010101010101010101010101" + const deletedDigest = "sha256:0202020202020202020202020202020202020202020202020202020202020202" + + seedReadyImage(t, p, "docker.io/library/alpine:latest", trackedDigest) + seedReadyImage(t, p, "docker.io/library/busybox:latest", deletedDigest) + writeStateAt(t, p, "docker.io/library/busybox", deletedDigest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + writeInstanceMetadata(t, p, "inst-1", instances.StoredMetadata{ + Id: "inst-1", + Name: "instance-1", + Image: "docker.io/library/debian@sha256:0303030303030303030303030303030303030303030303030303030303030303", + }) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + rm := collectRetentionMetrics(t, reader) + + require.Equal(t, int64(1), int64SumValue(t, rm, "hypeman_image_retention_sweeps_total", map[string]string{"status": "success"})) + require.Equal(t, uint64(1), float64HistogramCount(t, rm, "hypeman_image_retention_sweep_duration_seconds", map[string]string{"status": "success"})) + require.Equal(t, int64(1), int64SumValue(t, rm, "hypeman_image_retention_deletes_total", map[string]string{"status": "success"})) + require.Equal(t, int64(1), int64SumValue(t, rm, "hypeman_image_retention_stale_references_total", nil)) + require.Equal(t, int64(1), int64GaugeValue(t, rm, "hypeman_image_retention_pending_images", map[string]string{"state": "tracked"})) + require.Equal(t, int64(0), int64GaugeValue(t, rm, "hypeman_image_retention_pending_images", map[string]string{"state": "expired"})) +} + +func TestSweepReferencedInstancePreventsUnusedState(t *testing.T) { + controller, p, _ := newTestController(t, 30*24*time.Hour, []string{"*"}) + const digest = "sha256:2222222222222222222222222222222222222222222222222222222222222222" + imageName := "docker.io/library/alpine:latest" + seedReadyImage(t, p, imageName, digest) + writeInstanceMetadata(t, p, "inst-1", instances.StoredMetadata{ + Id: "inst-1", + Name: "instance-1", + Image: imageName, + }) + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.Error(t, err) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepSnapshotReferencePreventsDeletion(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const digest = "sha256:3333333333333333333333333333333333333333333333333333333333333333" + imageName := "docker.io/library/alpine:latest" + seedReadyImage(t, p, imageName, digest) + writeSnapshotRecord(t, p, "snap-1", instances.StoredMetadata{Image: imageName}) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest))) + require.NoError(t, err) + _, err = os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepDeletesExpiredImageDigestAndAllTags(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const digest = "sha256:4444444444444444444444444444444444444444444444444444444444444444" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest, "stable") + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) + _, err = os.Stat(p.ImageTagSymlink("docker.io/library/alpine", "latest")) + require.True(t, os.IsNotExist(err)) + _, err = os.Stat(p.ImageTagSymlink("docker.io/library/alpine", "stable")) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepProtectedImageClearsPriorState(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const digest = "sha256:5555555555555555555555555555555555555555555555555555555555555555" + imageName := "docker.io/library/alpine:latest" + seedReadyImage(t, p, imageName, digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + writeInstanceMetadata(t, p, "inst-1", instances.StoredMetadata{ + Id: "inst-1", + Name: "instance-1", + Image: imageName, + }) + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepPrunesStateForManuallyDeletedImage(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const digest = "sha256:6666666666666666666666666666666666666666666666666666666666666666" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + require.NoError(t, os.RemoveAll(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest)))) + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepIgnoresNonReadyImages(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const digest = "sha256:7777777777777777777777777777777777777777777777777777777777777777" + seedImage(t, p, "docker.io/library/alpine:latest", digest, images.StatusPending) + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepStaleReferencesDoNotBlockOtherCleanup(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const staleDigest = "sha256:8888888888888888888888888888888888888888888888888888888888888888" + const liveDigest = "sha256:9999999999999999999999999999999999999999999999999999999999999999" + seedReadyImage(t, p, "docker.io/library/alpine:latest", liveDigest) + writeStateAt(t, p, "docker.io/library/alpine", liveDigest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + writeInstanceMetadata(t, p, "inst-1", instances.StoredMetadata{ + Id: "inst-1", + Name: "instance-1", + Image: "docker.io/library/busybox@" + staleDigest, + }) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(liveDigest))) + require.True(t, os.IsNotExist(err)) +} + +func TestMarkUsedClearsState(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const digest = "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + require.NoError(t, controller.MarkUsed(context.Background(), "docker.io/library/alpine:latest", digest)) + + _, err := os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestRunPerformsImmediateSweep(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const digest = "sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + now := time.Date(2026, 3, 26, 12, 0, 0, 0, time.UTC) + controller.now = func() time.Time { return now } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + defer close(done) + _ = controller.Run(ctx) + }() + + statePath := p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest)) + require.Eventually(t, func() bool { + _, err := os.Stat(statePath) + return err == nil + }, time.Second, 10*time.Millisecond) + + cancel() + <-done + + state := readState(t, p, "docker.io/library/alpine", digest) + require.Equal(t, now, state.UnusedSince) +} + +func TestSweepSuccessLogIsDebugOnly(t *testing.T) { + dataDir := t.TempDir() + p := paths.New(dataDir) + manager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + var out bytes.Buffer + logger := slog.New(slog.NewTextHandler(&out, &slog.HandlerOptions{Level: slog.LevelInfo})) + controller, err := NewController(p, manager, 24*time.Hour, []string{"*"}, logger, nil) + require.NoError(t, err) + seedReadyImage(t, p, "docker.io/library/alpine:latest", "sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc") + + require.NoError(t, controller.Sweep(context.Background())) + require.NotContains(t, out.String(), "image auto-delete sweep completed") +} + +func TestSweepSkipsDisallowedRepositories(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"ghcr.io/kernel/*"}) + const digest = "sha256:dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest))) + require.NoError(t, err) + _, err = os.Stat(p.ImageRetentionState("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestSweepAllowsGlobalWildcard(t *testing.T) { + controller, p, _ := newTestController(t, 24*time.Hour, []string{"*"}) + const digest = "sha256:eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee" + seedReadyImage(t, p, "docker.io/library/alpine:latest", digest) + writeStateAt(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + controller.now = func() time.Time { + return time.Date(2026, 3, 26, 0, 0, 0, 0, time.UTC) + } + + require.NoError(t, controller.Sweep(context.Background())) + + _, err := os.Stat(p.ImageDigestDir("docker.io/library/alpine", stringsTrimDigest(digest))) + require.True(t, os.IsNotExist(err)) +} + +func TestAllowPatternMatches(t *testing.T) { + tests := []struct { + name string + pattern string + repository string + want bool + }{ + {name: "exact", pattern: "docker.io/library/alpine", repository: "docker.io/library/alpine", want: true}, + {name: "global wildcard", pattern: "*", repository: "docker.io/library/alpine", want: true}, + {name: "suffix wildcard", pattern: "docker.io/library/*", repository: "docker.io/library/alpine", want: true}, + {name: "embedded wildcard", pattern: "ghcr.io/kernel/hypeman-*", repository: "ghcr.io/kernel/hypeman-test", want: true}, + {name: "question mark", pattern: "ghcr.io/kernel/hypeman-?", repository: "ghcr.io/kernel/hypeman-a", want: true}, + {name: "no match", pattern: "ghcr.io/kernel/*", repository: "docker.io/library/alpine", want: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.want, allowPatternMatches(tt.pattern, tt.repository)) + }) + } +} + +func newTestController(t *testing.T, unusedFor time.Duration, allowed []string) (*Controller, *paths.Paths, images.Manager) { + t.Helper() + + dataDir := t.TempDir() + p := paths.New(dataDir) + manager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + controller, err := NewController(p, manager, unusedFor, allowed, nil, nil) + require.NoError(t, err) + return controller, p, manager +} + +func newMetricTestController(t *testing.T, unusedFor time.Duration, allowed []string) (*Controller, *paths.Paths, *otelmetric.ManualReader) { + t.Helper() + + dataDir := t.TempDir() + p := paths.New(dataDir) + manager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + reader := otelmetric.NewManualReader() + provider := otelmetric.NewMeterProvider(otelmetric.WithReader(reader)) + controller, err := NewController(p, manager, unusedFor, allowed, nil, provider.Meter("test")) + require.NoError(t, err) + return controller, p, reader +} + +func seedReadyImage(t *testing.T, p *paths.Paths, imageName, digest string, extraTags ...string) { + t.Helper() + seedImage(t, p, imageName, digest, images.StatusReady, extraTags...) +} + +func seedImage(t *testing.T, p *paths.Paths, imageName, digest, status string, extraTags ...string) { + t.Helper() + + ref, err := images.ParseNormalizedRef(imageName) + require.NoError(t, err) + + meta := map[string]any{ + "name": ref.String(), + "digest": digest, + "status": status, + "created_at": time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC), + } + + digestHex := stringsTrimDigest(digest) + dir := p.ImageDigestDir(ref.Repository(), digestHex) + require.NoError(t, os.MkdirAll(dir, 0o755)) + if status == images.StatusReady { + require.NoError(t, os.WriteFile(p.ImageDigestPath(ref.Repository(), digestHex), []byte("rootfs"), 0o644)) + } + content, err := json.MarshalIndent(meta, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(p.ImageMetadata(ref.Repository(), digestHex), content, 0o644)) + + if !ref.IsDigest() { + require.NoError(t, os.MkdirAll(p.ImageRepositoryDir(ref.Repository()), 0o755)) + require.NoError(t, os.Symlink(digestHex, p.ImageTagSymlink(ref.Repository(), ref.Tag()))) + } + for _, tag := range extraTags { + require.NoError(t, os.Symlink(digestHex, p.ImageTagSymlink(ref.Repository(), tag))) + } +} + +func writeInstanceMetadata(t *testing.T, p *paths.Paths, id string, stored instances.StoredMetadata) { + t.Helper() + require.NoError(t, os.MkdirAll(p.InstanceDir(id), 0o755)) + content, err := json.MarshalIndent(stored, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(p.InstanceMetadata(id), content, 0o644)) +} + +func writeSnapshotRecord(t *testing.T, p *paths.Paths, snapshotID string, stored instances.StoredMetadata) { + t.Helper() + content, err := json.Marshal(stored) + require.NoError(t, err) + + store := snapshotstore.NewStore(p) + err = store.SaveRecord(&snapshotstore.Record{ + Snapshot: snapshotstore.Snapshot{ + Id: snapshotID, + SourceInstanceID: "inst-1", + CreatedAt: time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC), + }, + StoredMetadata: content, + }) + require.NoError(t, err) +} + +func writeStateAt(t *testing.T, p *paths.Paths, repository, digest string, unusedSince time.Time) { + t.Helper() + statePath := p.ImageRetentionState(repository, stringsTrimDigest(digest)) + require.NoError(t, os.MkdirAll(filepath.Dir(statePath), 0o755)) + content, err := json.MarshalIndent(imageState{ + Repository: repository, + Digest: digest, + UnusedSince: unusedSince, + }, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(statePath, content, 0o644)) +} + +func readState(t *testing.T, p *paths.Paths, repository, digest string) imageState { + t.Helper() + content, err := os.ReadFile(p.ImageRetentionState(repository, stringsTrimDigest(digest))) + require.NoError(t, err) + var state imageState + require.NoError(t, json.Unmarshal(content, &state)) + return state +} + +func collectRetentionMetrics(t *testing.T, reader *otelmetric.ManualReader) metricdata.ResourceMetrics { + t.Helper() + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(context.Background(), &rm)) + return rm +} + +func int64GaugeValue(t *testing.T, rm metricdata.ResourceMetrics, name string, wantAttrs map[string]string) int64 { + t.Helper() + + metric := findRetentionMetric(t, rm, name) + gauge, ok := metric.Data.(metricdata.Gauge[int64]) + require.True(t, ok, "expected int64 gauge metric data for %s", name) + for _, point := range gauge.DataPoints { + if retentionMetricAttrsMatch(point.Attributes, wantAttrs) { + return point.Value + } + } + t.Fatalf("metric %s with attrs %v not found", name, wantAttrs) + return 0 +} + +func int64SumValue(t *testing.T, rm metricdata.ResourceMetrics, name string, wantAttrs map[string]string) int64 { + t.Helper() + + metric := findRetentionMetric(t, rm, name) + sum, ok := metric.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected int64 sum metric data for %s", name) + for _, point := range sum.DataPoints { + if retentionMetricAttrsMatch(point.Attributes, wantAttrs) { + return point.Value + } + } + t.Fatalf("metric %s with attrs %v not found", name, wantAttrs) + return 0 +} + +func float64HistogramCount(t *testing.T, rm metricdata.ResourceMetrics, name string, wantAttrs map[string]string) uint64 { + t.Helper() + + metric := findRetentionMetric(t, rm, name) + histogram, ok := metric.Data.(metricdata.Histogram[float64]) + require.True(t, ok, "expected float64 histogram metric data for %s", name) + for _, point := range histogram.DataPoints { + if retentionMetricAttrsMatch(point.Attributes, wantAttrs) { + return point.Count + } + } + t.Fatalf("metric %s with attrs %v not found", name, wantAttrs) + return 0 +} + +func findRetentionMetric(t *testing.T, rm metricdata.ResourceMetrics, name string) metricdata.Metrics { + t.Helper() + + for _, scope := range rm.ScopeMetrics { + for _, metric := range scope.Metrics { + if metric.Name == name { + return metric + } + } + } + + t.Fatalf("metric %s not found", name) + return metricdata.Metrics{} +} + +func retentionMetricAttrsMatch(set attribute.Set, want map[string]string) bool { + if len(want) == 0 { + return true + } + + attrs := make(map[string]string, len(set.ToSlice())) + for _, kv := range set.ToSlice() { + attrs[string(kv.Key)] = kv.Value.AsString() + } + for key, value := range want { + if attrs[key] != value { + return false + } + } + return true +} + +func stringsTrimDigest(digest string) string { + return digest[len("sha256:"):] +} diff --git a/lib/imageretention/metrics.go b/lib/imageretention/metrics.go new file mode 100644 index 00000000..d07016ab --- /dev/null +++ b/lib/imageretention/metrics.go @@ -0,0 +1,120 @@ +package imageretention + +import ( + "context" + "time" + + hypotel "github.com/kernel/hypeman/lib/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Metrics holds the OTel instruments for image retention. +type Metrics struct { + sweepsTotal metric.Int64Counter + sweepDuration metric.Float64Histogram + deletesTotal metric.Int64Counter + staleReferencesTotal metric.Int64Counter + pendingImages metric.Int64ObservableGauge +} + +func newMetrics(meter metric.Meter, c *Controller) (*Metrics, error) { + sweepsTotal, err := meter.Int64Counter( + "hypeman_image_retention_sweeps_total", + metric.WithDescription("Total number of image retention sweeps"), + ) + if err != nil { + return nil, err + } + + sweepDuration, err := meter.Float64Histogram( + "hypeman_image_retention_sweep_duration_seconds", + metric.WithDescription("Duration of image retention sweeps"), + metric.WithUnit("s"), + metric.WithExplicitBucketBoundaries(hypotel.CommonDurationHistogramBuckets()...), + ) + if err != nil { + return nil, err + } + + deletesTotal, err := meter.Int64Counter( + "hypeman_image_retention_deletes_total", + metric.WithDescription("Total number of image retention delete attempts"), + ) + if err != nil { + return nil, err + } + + staleReferencesTotal, err := meter.Int64Counter( + "hypeman_image_retention_stale_references_total", + metric.WithDescription("Total number of stale image references skipped during retention sweeps"), + ) + if err != nil { + return nil, err + } + + pendingImages, err := meter.Int64ObservableGauge( + "hypeman_image_retention_pending_images", + metric.WithDescription("Number of images currently tracked by retention state"), + ) + if err != nil { + return nil, err + } + + _, err = meter.RegisterCallback( + func(ctx context.Context, o metric.Observer) error { + states, now, err := c.listStatesSnapshot() + if err != nil { + return nil + } + + var tracked int64 + var expired int64 + for _, state := range states { + tracked++ + if !state.UnusedSince.IsZero() && !state.UnusedSince.UTC().Add(c.unusedFor).After(now) { + expired++ + } + } + + o.ObserveInt64(pendingImages, tracked, metric.WithAttributes(attribute.String("state", "tracked"))) + o.ObserveInt64(pendingImages, expired, metric.WithAttributes(attribute.String("state", "expired"))) + return nil + }, + pendingImages, + ) + if err != nil { + return nil, err + } + + return &Metrics{ + sweepsTotal: sweepsTotal, + sweepDuration: sweepDuration, + deletesTotal: deletesTotal, + staleReferencesTotal: staleReferencesTotal, + pendingImages: pendingImages, + }, nil +} + +// RecordSweep records the outcome and duration of a retention sweep. +func (m *Metrics) RecordSweep(ctx context.Context, status string, duration time.Duration, staleReferences int) { + if m == nil { + return + } + + attrs := metric.WithAttributes(attribute.String("status", status)) + m.sweepsTotal.Add(ctx, 1, attrs) + m.sweepDuration.Record(ctx, duration.Seconds(), attrs) + if staleReferences > 0 { + m.staleReferencesTotal.Add(ctx, int64(staleReferences)) + } +} + +// RecordDelete records the outcome of an image delete attempt. +func (m *Metrics) RecordDelete(ctx context.Context, status string) { + if m == nil { + return + } + + m.deletesTotal.Add(ctx, 1, metric.WithAttributes(attribute.String("status", status))) +} diff --git a/lib/instances/create.go b/lib/instances/create.go index a620a24f..ca4be904 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -133,6 +133,7 @@ func (m *manager) createInstance( log.ErrorContext(ctx, "image not ready", "image", req.Image, "status", imageInfo.Status) return nil, fmt.Errorf("%w: image status is %s", ErrImageNotReady, imageInfo.Status) } + m.recordImageUsage(ctx, imageInfo) // 3. Generate instance ID (CUID2 for secure, collision-resistant IDs) id := cuid2.Generate() diff --git a/lib/instances/image_usage_test.go b/lib/instances/image_usage_test.go new file mode 100644 index 00000000..7ec1d946 --- /dev/null +++ b/lib/instances/image_usage_test.go @@ -0,0 +1,96 @@ +package instances + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "testing" + "time" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/paths" + "github.com/stretchr/testify/require" +) + +func TestCreateInstanceClearsRetentionStateBeforeMetadataSave(t *testing.T) { + mgr, tmpDir := setupTestManager(t) + p := paths.New(tmpDir) + + const digest = "sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" + const imageName = "docker.io/library/alpine:latest" + seedLocalReadyImage(t, p, imageName, digest) + writeImageRetentionState(t, p, "docker.io/library/alpine", digest, time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC)) + + recorder := &stubImageUsageRecorder{ + statePath: p.ImageRetentionState("docker.io/library/alpine", trimDigestPrefix(digest)), + } + mgr.SetImageUsageRecorder(recorder) + + _, err := mgr.CreateInstance(context.Background(), CreateInstanceRequest{ + Name: "retention-test", + Image: imageName, + Hypervisor: hypervisor.Type("missing-hypervisor"), + }) + require.Error(t, err) + + _, err = os.Stat(p.ImageRetentionState("docker.io/library/alpine", trimDigestPrefix(digest))) + require.True(t, os.IsNotExist(err)) + + metaFiles, err := mgr.listMetadataFiles() + require.NoError(t, err) + require.Len(t, metaFiles, 0) + require.Equal(t, 1, recorder.calls) +} + +func seedLocalReadyImage(t *testing.T, p *paths.Paths, imageName, digest string) { + t.Helper() + + ref, err := images.ParseNormalizedRef(imageName) + require.NoError(t, err) + + digestHex := trimDigestPrefix(digest) + require.NoError(t, os.MkdirAll(p.ImageDigestDir(ref.Repository(), digestHex), 0o755)) + require.NoError(t, os.WriteFile(p.ImageDigestPath(ref.Repository(), digestHex), []byte("rootfs"), 0o644)) + + meta := map[string]any{ + "name": ref.String(), + "digest": digest, + "status": images.StatusReady, + "created_at": time.Date(2026, 3, 20, 0, 0, 0, 0, time.UTC), + } + content, err := json.MarshalIndent(meta, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(p.ImageMetadata(ref.Repository(), digestHex), content, 0o644)) + require.NoError(t, os.MkdirAll(p.ImageRepositoryDir(ref.Repository()), 0o755)) + require.NoError(t, os.Symlink(digestHex, p.ImageTagSymlink(ref.Repository(), ref.Tag()))) +} + +func writeImageRetentionState(t *testing.T, p *paths.Paths, repository, digest string, unusedSince time.Time) { + t.Helper() + statePath := p.ImageRetentionState(repository, trimDigestPrefix(digest)) + require.NoError(t, os.MkdirAll(filepath.Dir(statePath), 0o755)) + + content, err := json.MarshalIndent(map[string]any{ + "repository": repository, + "digest": digest, + "unused_since": unusedSince, + }, "", " ") + require.NoError(t, err) + require.NoError(t, os.WriteFile(statePath, content, 0o644)) +} + +func trimDigestPrefix(digest string) string { + return digest[len("sha256:"):] +} + +type stubImageUsageRecorder struct { + statePath string + calls int +} + +func (s *stubImageUsageRecorder) MarkUsed(ctx context.Context, imageName, digest string) error { + s.calls++ + return os.Remove(s.statePath) +} diff --git a/lib/instances/manager.go b/lib/instances/manager.go index cab30b67..4ef8d9bf 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -12,6 +12,7 @@ import ( "github.com/kernel/hypeman/lib/guestmemory" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" + "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/network" "github.com/kernel/hypeman/lib/paths" "github.com/kernel/hypeman/lib/resources" @@ -61,6 +62,16 @@ type Manager interface { Subscribe(instanceID string) (<-chan StateChange, func()) } +// ImageUsageRecorder records newly used images before instance metadata is persisted. +type ImageUsageRecorder interface { + MarkUsed(ctx context.Context, imageName, digest string) error +} + +// ImageUsageRecorderSetter configures an optional image usage recorder on the manager. +type ImageUsageRecorderSetter interface { + SetImageUsageRecorder(recorder ImageUsageRecorder) +} + // ResourceLimits contains configurable resource limits for instances type ResourceLimits struct { MaxOverlaySize int64 // Maximum overlay disk size in bytes per instance @@ -102,6 +113,7 @@ type manager struct { compressionJobs map[string]*compressionJob nativeCodecMu sync.Mutex nativeCodecPaths map[string]string + imageUsageRecorder ImageUsageRecorder // State change subscriptions for waitForState stateSubscribers *subscribers @@ -178,6 +190,11 @@ func (m *manager) SetResourceValidator(v ResourceValidator) { m.resourceValidator = v } +// SetImageUsageRecorder configures an optional recorder for pre-persistence image usage. +func (m *manager) SetImageUsageRecorder(recorder ImageUsageRecorder) { + m.imageUsageRecorder = recorder +} + func (m *manager) Subscribe(instanceID string) (<-chan StateChange, func()) { return m.stateSubscribers.Subscribe(instanceID) } @@ -236,6 +253,16 @@ func (m *manager) maybePersistBootMarkers(ctx context.Context, id string) { m.persistBootMarkers(ctx, id) } +func (m *manager) recordImageUsage(ctx context.Context, imageInfo *images.Image) { + if m.imageUsageRecorder == nil || imageInfo == nil { + return + } + if err := m.imageUsageRecorder.MarkUsed(ctx, imageInfo.Name, imageInfo.Digest); err != nil { + log := logger.FromContext(ctx) + log.WarnContext(ctx, "failed to record image usage", "image", imageInfo.Name, "digest", imageInfo.Digest, "error", err) + } +} + // CreateInstance creates and starts a new instance func (m *manager) CreateInstance(ctx context.Context, req CreateInstanceRequest) (*Instance, error) { // Note: ID is generated inside createInstance, so we can't lock before calling it. diff --git a/lib/otel/README.md b/lib/otel/README.md index 6f1fa944..04f9113c 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -55,6 +55,11 @@ This keeps pull and push views aligned because both are sourced from the same OT | `hypeman_images_build_duration_seconds` | histogram | status | Image build time | | `hypeman_images_total` | gauge | status | Cached images count | | `hypeman_images_pulls_total` | counter | status | Registry pulls | +| `hypeman_image_retention_pending_images` | gauge | state | Images currently tracked by retention state | +| `hypeman_image_retention_stale_references_total` | counter | | Stale image references skipped during retention sweeps | +| `hypeman_image_retention_sweeps_total` | counter | status | Retention sweep runs | +| `hypeman_image_retention_sweep_duration_seconds` | histogram | status | Retention sweep latency | +| `hypeman_image_retention_deletes_total` | counter | status | Retention delete attempts | ### Instances | Metric | Type | Labels | Description | diff --git a/lib/paths/paths.go b/lib/paths/paths.go index 35ddf884..adc070c4 100644 --- a/lib/paths/paths.go +++ b/lib/paths/paths.go @@ -53,6 +53,21 @@ func (p *Paths) SystemOCICache() string { return filepath.Join(p.dataDir, "system", "oci-cache") } +// SystemImageRetentionDir returns the path to persisted image retention state. +func (p *Paths) SystemImageRetentionDir() string { + return filepath.Join(p.dataDir, "system", "image-retention") +} + +// ImageRetentionRepositoryDir returns the directory for image retention state for one repository. +func (p *Paths) ImageRetentionRepositoryDir(repository string) string { + return filepath.Join(p.SystemImageRetentionDir(), repository) +} + +// ImageRetentionState returns the state file path for one retained image digest. +func (p *Paths) ImageRetentionState(repository, digestHex string) string { + return filepath.Join(p.ImageRetentionRepositoryDir(repository), digestHex+".json") +} + // OCICacheBlobDir returns the path to the OCI cache blobs directory. func (p *Paths) OCICacheBlobDir() string { return filepath.Join(p.SystemOCICache(), "blobs", "sha256")