Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions cmd/api/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,18 @@ type LoggingConfig struct {
RotateInterval string `koanf:"rotate_interval"`
}

// ImagesAutoDeleteConfig holds server-wide image retention settings.
type ImagesAutoDeleteConfig struct {
Enabled bool `koanf:"enabled"`
UnusedFor string `koanf:"unused_for"`
Allowed []string `koanf:"allowed"`
}

// ImagesConfig holds image-management settings.
type ImagesConfig struct {
AutoDelete ImagesAutoDeleteConfig `koanf:"auto_delete"`
}

// BuildConfig holds source-to-image build system settings.
type BuildConfig struct {
MaxConcurrentSourceBuilds int `koanf:"max_concurrent_source_builds"`
Expand Down Expand Up @@ -226,6 +238,7 @@ type Config struct {
Metrics MetricsConfig `koanf:"metrics"`
Otel OtelConfig `koanf:"otel"`
Logging LoggingConfig `koanf:"logging"`
Images ImagesConfig `koanf:"images"`
Build BuildConfig `koanf:"build"`
Registry RegistryConfig `koanf:"registry"`
Limits LimitsConfig `koanf:"limits"`
Expand Down Expand Up @@ -319,6 +332,14 @@ func defaultConfig() *Config {
RotateInterval: "5m",
},

Images: ImagesConfig{
AutoDelete: ImagesAutoDeleteConfig{
Enabled: false,
UnusedFor: "720h",
Allowed: []string{},
},
},

Build: BuildConfig{
MaxConcurrentSourceBuilds: 2,
BuilderImage: "", // empty = build from embedded Dockerfile on first run
Expand Down Expand Up @@ -511,6 +532,12 @@ func (c *Config) Validate() error {
if c.Build.Timeout <= 0 {
return fmt.Errorf("build.timeout must be positive, got %d", c.Build.Timeout)
}
if err := validateDuration("images.auto_delete.unused_for", c.Images.AutoDelete.UnusedFor); err != nil {
return err
}

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing positive duration validation for unused_for config

Low Severity

The images.auto_delete.unused_for field is validated only by validateDuration, which accepts any parseable Go duration including zero and negative values. A zero or negative value would cause images to be eligible for deletion on the very next sweep after being marked unused (~1 minute), rather than observing a meaningful retention window, contradicting the documented 720h default intent.

Fix in Cursor Fix in Web

for i, pattern := range c.Images.AutoDelete.Allowed {
c.Images.AutoDelete.Allowed[i] = strings.TrimSpace(pattern)
}
algorithm := strings.ToLower(c.Snapshot.CompressionDefault.Algorithm)
c.Snapshot.CompressionDefault.Algorithm = algorithm
if c.Snapshot.CompressionDefault.Enabled {
Expand Down
60 changes: 60 additions & 0 deletions cmd/api/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) {
if cfg.Otel.SuccessfulGetSampleRatio != 0.1 {
t.Fatalf("expected default otel.successful_get_sample_ratio to be 0.1, got %v", cfg.Otel.SuccessfulGetSampleRatio)
}
if cfg.Images.AutoDelete.Enabled {
t.Fatalf("expected default images.auto_delete.enabled to be false")
}
if cfg.Images.AutoDelete.UnusedFor != "720h" {
t.Fatalf("expected default images.auto_delete.unused_for to be 720h, got %q", cfg.Images.AutoDelete.UnusedFor)
}
if len(cfg.Images.AutoDelete.Allowed) != 0 {
t.Fatalf("expected default images.auto_delete.allowed to be empty, got %v", cfg.Images.AutoDelete.Allowed)
}
}

func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
Expand Down Expand Up @@ -138,6 +147,57 @@ func TestValidateRejectsInvalidResourceRefreshInterval(t *testing.T) {
}
}

func TestLoadUsesDefaultImageAutoDeleteRetentionWindow(t *testing.T) {
tmp := t.TempDir()
cfgPath := filepath.Join(tmp, "config.yaml")
if err := os.WriteFile(cfgPath, []byte("images:\n auto_delete:\n enabled: true\n"), 0600); err != nil {
t.Fatalf("write temp config: %v", err)
}

cfg, err := Load(cfgPath)
if err != nil {
t.Fatalf("load config: %v", err)
}

if !cfg.Images.AutoDelete.Enabled {
t.Fatalf("expected images.auto_delete.enabled override to be true")
}
if cfg.Images.AutoDelete.UnusedFor != "720h" {
t.Fatalf("expected default images.auto_delete.unused_for to remain 720h, got %q", cfg.Images.AutoDelete.UnusedFor)
}
if len(cfg.Images.AutoDelete.Allowed) != 0 {
t.Fatalf("expected default images.auto_delete.allowed to remain empty, got %v", cfg.Images.AutoDelete.Allowed)
}
}

func TestValidateRejectsInvalidImageAutoDeleteUnusedFor(t *testing.T) {
cfg := defaultConfig()
cfg.Images.AutoDelete.UnusedFor = "not-a-duration"

err := cfg.Validate()
if err == nil {
t.Fatalf("expected validation error for invalid images.auto_delete.unused_for")
}
}

func TestValidateTrimsImageAutoDeleteAllowedPatterns(t *testing.T) {
cfg := defaultConfig()
cfg.Images.AutoDelete.Allowed = []string{" docker.io/library/* ", " ", "ghcr.io/kernel/*"}

if err := cfg.Validate(); err != nil {
t.Fatalf("expected image auto delete allow list to validate, got %v", err)
}
if cfg.Images.AutoDelete.Allowed[0] != "docker.io/library/*" {
t.Fatalf("expected first allow pattern to be trimmed, got %q", cfg.Images.AutoDelete.Allowed[0])
}
if cfg.Images.AutoDelete.Allowed[1] != "" {
t.Fatalf("expected whitespace-only allow pattern to trim to empty string, got %q", cfg.Images.AutoDelete.Allowed[1])
}
if cfg.Images.AutoDelete.Allowed[2] != "ghcr.io/kernel/*" {
t.Fatalf("expected third allow pattern to be trimmed, got %q", cfg.Images.AutoDelete.Allowed[2])
}
}

func TestValidateRejectsEmptyActiveBallooningDurations(t *testing.T) {
cfg := defaultConfig()
cfg.Hypervisor.Memory.ActiveBallooning.PollInterval = " "
Expand Down
54 changes: 54 additions & 0 deletions cmd/api/image_retention_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package main

import (
"context"
"sync/atomic"
"testing"
"time"

"golang.org/x/sync/errgroup"
)

type stubImageRetentionRunner struct {
runCount atomic.Int32
}

func (s *stubImageRetentionRunner) Run(ctx context.Context) error {
s.runCount.Add(1)
<-ctx.Done()
return nil
}

func TestStartImageRetentionControllerSkipsNilController(t *testing.T) {
grp, ctx := errgroup.WithContext(context.Background())

started := startImageRetentionController(grp, ctx, nil)
if started {
t.Fatalf("expected nil controller not to start")
}
}

func TestStartImageRetentionControllerStartsRunner(t *testing.T) {
grp, ctx := errgroup.WithContext(context.Background())
ctx, cancel := context.WithCancel(ctx)
defer cancel()

runner := &stubImageRetentionRunner{}
started := startImageRetentionController(grp, ctx, runner)
if !started {
t.Fatalf("expected controller to start")
}

deadline := time.Now().Add(time.Second)
for runner.runCount.Load() == 0 && time.Now().Before(deadline) {
time.Sleep(10 * time.Millisecond)
}
if runner.runCount.Load() != 1 {
t.Fatalf("expected runner to be started once, got %d", runner.runCount.Load())
}

cancel()
if err := grp.Wait(); err != nil {
t.Fatalf("wait for retention runner: %v", err)
}
}
53 changes: 53 additions & 0 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,20 @@ import (
"github.com/kernel/hypeman/lib/devices"
"github.com/kernel/hypeman/lib/guest"
"github.com/kernel/hypeman/lib/hypervisor/qemu"
"github.com/kernel/hypeman/lib/imageretention"
"github.com/kernel/hypeman/lib/images"
"github.com/kernel/hypeman/lib/instances"
loglib "github.com/kernel/hypeman/lib/logger"
mw "github.com/kernel/hypeman/lib/middleware"
"github.com/kernel/hypeman/lib/oapi"
"github.com/kernel/hypeman/lib/otel"
"github.com/kernel/hypeman/lib/paths"
"github.com/kernel/hypeman/lib/registry"
"github.com/kernel/hypeman/lib/scopes"
"github.com/kernel/hypeman/lib/vmm"
nethttpmiddleware "github.com/oapi-codegen/nethttp-middleware"
"github.com/riandyrn/otelchi"
"go.opentelemetry.io/otel/metric"
"golang.org/x/sync/errgroup"
)

Expand All @@ -59,6 +64,40 @@ func newMetricsServer(addr string, handler http.Handler) *http.Server {
}
}

type imageRetentionRunner interface {
Run(ctx context.Context) error
}

func configureImageRetentionController(cfg *config.Config, imageManager images.Manager, instanceManager instances.Manager, logger *slog.Logger, meter metric.Meter) (imageRetentionRunner, error) {
if cfg == nil || !cfg.Images.AutoDelete.Enabled {
return nil, nil
}

unusedFor, err := time.ParseDuration(cfg.Images.AutoDelete.UnusedFor)
if err != nil {
return nil, fmt.Errorf("invalid images.auto_delete.unused_for %q: %w", cfg.Images.AutoDelete.UnusedFor, err)
}

controller, err := imageretention.NewController(paths.New(cfg.DataDir), imageManager, unusedFor, cfg.Images.AutoDelete.Allowed, logger, meter)
if err != nil {
return nil, err
}
if setter, ok := instanceManager.(instances.ImageUsageRecorderSetter); ok {
setter.SetImageUsageRecorder(controller)
}
return controller, nil
}

func startImageRetentionController(grp *errgroup.Group, ctx context.Context, controller imageRetentionRunner) bool {
if grp == nil || controller == nil {
return false
}
grp.Go(func() error {
return controller.Run(ctx)
})
Comment thread
sjmiller609 marked this conversation as resolved.
return true
}

func run() error {
// Load config early for OTel initialization
// Config path can be specified via CONFIG_PATH env var or defaults to platform-specific locations
Expand Down Expand Up @@ -429,6 +468,20 @@ func run() error {
// Error group for coordinated shutdown
grp, gctx := errgroup.WithContext(ctx)

retentionController, err := configureImageRetentionController(
app.Config,
app.ImageManager,
app.InstanceManager,
logger,
otelProvider.MeterFor(loglib.SubsystemImages),
)
if err != nil {
return err
}
if startImageRetentionController(grp, gctx, retentionController) {
logger.Info("image auto-delete enabled", "unused_for", app.Config.Images.AutoDelete.UnusedFor)
}

// Start build manager background services (vsock handler for builder VMs)
if err := app.BuildManager.Start(gctx); err != nil {
logger.Error("failed to start build manager", "error", err)
Expand Down
12 changes: 12 additions & 0 deletions config.example.darwin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,18 @@ network:
logging:
level: debug

# =============================================================================
# Images
# =============================================================================
# images:
# auto_delete:
# enabled: false # server-wide automatic deletion of cached converted images
# unused_for: 720h # delete only after no instances or snapshots reference the image for this long
# allowed: # safety gate: only delete repositories matching one of these patterns
# - docker.io/library/* # match normalized repository names
# - ghcr.io/kernel/* # use ["*"] to allow deletion for every repository
# # only affects data_dir/images, not the shared OCI cache

# =============================================================================
# Caddy / Ingress Configuration
# =============================================================================
Expand Down
12 changes: 12 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ data_dir: /var/lib/hypeman
# logging:
# level: info # debug, info, warn, error

# =============================================================================
# Images
# =============================================================================
# images:
# auto_delete:
# enabled: false # server-wide automatic deletion of cached converted images
# unused_for: 720h # delete only after no instances or snapshots reference the image for this long
# allowed: # safety gate: only delete repositories matching one of these patterns
# - docker.io/library/* # match normalized repository names
# - ghcr.io/kernel/* # use ["*"] to allow deletion for every repository
# # only affects data_dir/images, not the shared OCI cache

# =============================================================================
# Caddy / Ingress Configuration
# =============================================================================
Expand Down
28 changes: 28 additions & 0 deletions lib/imageretention/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Image Retention

This feature automatically deletes cached converted images after they have been unused for a configurable amount of time.

The retention window is server-wide and controlled by:

```yaml
images:
auto_delete:
enabled: false
unused_for: 720h
allowed: []
```

When auto-delete is enabled:

- The server runs a retention sweep on startup and then every minute.
- Only converted cached images under `data_dir/images` are eligible for deletion.
- Shared OCI cache data under `data_dir/system/oci-cache` is not modified.
- An image repository must also match at least one `allowed` pattern before any retention state is recorded or deletion is attempted.

An image is considered in use if any persisted instance metadata or snapshot record still references it. As long as at least one such reference exists, the image is protected from deletion.

The retention timer starts only after the last persisted reference disappears and the image repository is allowed by policy. At that point the server records `unused_since` for the image digest. Once `unused_since + unused_for` has elapsed, the cached image digest is deleted.

Allow-list patterns match normalized repository names such as `docker.io/library/alpine`. A lone `*` pattern allows deletion for every repository.

New instance creation clears any stale unused state for the resolved image before the new instance metadata is written. This helps prevent races where an image is being reused right as retention cleanup runs.
Loading
Loading