diff --git a/.env.darwin.example b/.env.darwin.example index f714f06e..6c7f5f87 100644 --- a/.env.darwin.example +++ b/.env.darwin.example @@ -88,6 +88,10 @@ MAX_MEMORY_PER_INSTANCE=8GB # OTEL_ENDPOINT=127.0.0.1:4317 # OTEL_SERVICE_NAME=hypeman # OTEL_INSECURE=true +# OTEL__METRIC_EXPORT_INTERVAL=60s +# METRICS__LISTEN_ADDRESS=127.0.0.1 +# METRICS__PORT=9464 +# METRICS__VM_LABEL_BUDGET=200 # ENV=dev # ============================================================================= diff --git a/.env.example b/.env.example index ea445648..eecbc081 100644 --- a/.env.example +++ b/.env.example @@ -60,6 +60,10 @@ DATA_DIR=/var/lib/hypeman # OTEL_SERVICE_NAME=hypeman # OTEL_SERVICE_INSTANCE_ID= # default: hostname # OTEL_INSECURE=true +# OTEL__METRIC_EXPORT_INTERVAL=60s # OTLP push cadence (when OTEL_ENABLED=true) +# METRICS__LISTEN_ADDRESS=127.0.0.1 +# METRICS__PORT=9464 +# METRICS__VM_LABEL_BUDGET=200 # warn when observed per-VM metric labels exceed budget # ENV=dev # deployment environment # ============================================================================= diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 1813f547..7af86128 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -115,6 +115,10 @@ Common settings: | `logging.level` | Log level (debug, info, warn, error) | `info` | | `otel.enabled` | Enable OpenTelemetry traces/metrics | `false` | | `otel.endpoint` | OTLP gRPC endpoint | `127.0.0.1:4317` | +| `otel.metric_export_interval` | OTLP metric push interval | `60s` | +| `metrics.listen_address` | Bind address for `/metrics` endpoint | `127.0.0.1` | +| `metrics.port` | Port for `/metrics` endpoint | `9464` | +| `metrics.vm_label_budget` | Warning threshold for observed per-VM metric labels | `200` | | `limits.max_concurrent_builds` | Max concurrent image builds | `1` | | `limits.max_overlay_size` | Max overlay filesystem size | `100GB` | | `acme.email` | Email for ACME certificate registration | _(empty)_ | diff --git a/cmd/api/api/cp.go b/cmd/api/api/cp.go index 6bae53ed..bfa64e45 100644 --- a/cmd/api/api/cp.go +++ b/cmd/api/api/cp.go @@ -84,6 +84,13 @@ type CpResult struct { BytesWritten int64 `json:"bytes_written,omitempty"` } +func cpSpanAttributes(instanceID, direction string) []attribute.KeyValue { + return []attribute.KeyValue{ + attribute.String("instance_id", instanceID), + attribute.String("direction", direction), + } +} + // CpHandler handles file copy requests via WebSocket func (s *ApiService) CpHandler(w http.ResponseWriter, r *http.Request) { ctx := r.Context() @@ -146,12 +153,7 @@ func (s *ApiService) CpHandler(w http.ResponseWriter, r *http.Request) { // Start OTEL span for tracing (WebSocket bypasses otelchi middleware) tracer := otel.Tracer("hypeman/cp") ctx, span := tracer.Start(ctx, "cp.session", - trace.WithAttributes( - attribute.String("instance_id", inst.Id), - attribute.String("direction", cpReq.Direction), - attribute.String("guest_path", cpReq.GuestPath), - attribute.String("subject", subject), - ), + trace.WithAttributes(cpSpanAttributes(inst.Id, cpReq.Direction)...), ) defer span.End() diff --git a/cmd/api/api/exec.go b/cmd/api/api/exec.go index b1e13c2c..b6f93102 100644 --- a/cmd/api/api/exec.go +++ b/cmd/api/api/exec.go @@ -15,6 +15,10 @@ import ( "github.com/kernel/hypeman/lib/instances" "github.com/kernel/hypeman/lib/logger" mw "github.com/kernel/hypeman/lib/middleware" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" ) var upgrader = websocket.Upgrader{ @@ -46,6 +50,13 @@ type ResizeMessage struct { } `json:"resize"` } +func execSpanAttributes(instanceID string, tty bool) []attribute.KeyValue { + return []attribute.KeyValue{ + attribute.String("instance_id", instanceID), + attribute.Bool("tty", tty), + } +} + // ExecHandler handles exec requests via WebSocket for bidirectional streaming // Note: Resolution is handled by ResolveResource middleware func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { @@ -108,6 +119,11 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { } } + // Start OTEL span for tracing (WebSocket bypasses otelchi middleware). + tracer := otel.Tracer("hypeman/exec") + ctx, span := tracer.Start(ctx, "exec.session", trace.WithAttributes(execSpanAttributes(inst.Id, execReq.TTY)...)) + defer span.End() + // Audit log: exec session started log.InfoContext(ctx, "exec session started", "instance_id", inst.Id, @@ -133,6 +149,8 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { dialer, err := s.InstanceManager.GetVsockDialer(ctx, inst.Id) if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) log.ErrorContext(ctx, "failed to get vsock dialer", "error", err) ws.WriteMessage(websocket.BinaryMessage, []byte(fmt.Sprintf("Error: %v\r\n", err))) ws.WriteMessage(websocket.TextMessage, []byte(`{"exitCode":127}`)) @@ -158,6 +176,8 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { duration := time.Since(startTime) if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) log.ErrorContext(ctx, "exec failed", "error", err, "instance_id", inst.Id, @@ -180,6 +200,8 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { "exit_code", exit.Code, "duration_ms", duration.Milliseconds(), ) + span.SetAttributes(attribute.Int("exit_code", exit.Code)) + span.SetStatus(codes.Ok, "") // Send close frame with exit code in JSON closeMsg := fmt.Sprintf(`{"exitCode":%d}`, exit.Code) diff --git a/cmd/api/api/trace_attrs_test.go b/cmd/api/api/trace_attrs_test.go new file mode 100644 index 00000000..6b1191c2 --- /dev/null +++ b/cmd/api/api/trace_attrs_test.go @@ -0,0 +1,33 @@ +package api + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestCpSpanAttributes(t *testing.T) { + attrs := cpSpanAttributes("inst-123", "to") + got := map[string]any{} + for _, attr := range attrs { + got[string(attr.Key)] = attr.Value.AsInterface() + } + + require.Equal(t, "inst-123", got["instance_id"]) + require.Equal(t, "to", got["direction"]) + require.NotContains(t, got, "guest_path") + require.NotContains(t, got, "subject") +} + +func TestExecSpanAttributes(t *testing.T) { + attrs := execSpanAttributes("inst-456", true) + got := map[string]any{} + for _, attr := range attrs { + got[string(attr.Key)] = attr.Value.AsInterface() + } + + require.Equal(t, "inst-456", got["instance_id"]) + require.Equal(t, true, got["tty"]) + require.NotContains(t, got, "guest_path") + require.NotContains(t, got, "subject") +} diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 4ee3d442..52cd1ca3 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -7,6 +7,7 @@ import ( "runtime" "runtime/debug" "strings" + "time" "github.com/knadh/koanf/parsers/yaml" "github.com/knadh/koanf/providers/env" @@ -93,13 +94,21 @@ type APIConfig struct { RedirectHTTP bool `koanf:"redirect_http"` } +// MetricsConfig holds metrics endpoint settings. +type MetricsConfig struct { + ListenAddress string `koanf:"listen_address"` + Port int `koanf:"port"` + VMLabelBudget int `koanf:"vm_label_budget"` +} + // OtelConfig holds OpenTelemetry settings. type OtelConfig struct { - Enabled bool `koanf:"enabled"` - Endpoint string `koanf:"endpoint"` - ServiceName string `koanf:"service_name"` - ServiceInstanceID string `koanf:"service_instance_id"` - Insecure bool `koanf:"insecure"` + Enabled bool `koanf:"enabled"` + Endpoint string `koanf:"endpoint"` + ServiceName string `koanf:"service_name"` + ServiceInstanceID string `koanf:"service_instance_id"` + Insecure bool `koanf:"insecure"` + MetricExportInterval string `koanf:"metric_export_interval"` } // LoggingConfig holds log rotation and level settings. @@ -175,6 +184,7 @@ type Config struct { Caddy CaddyConfig `koanf:"caddy"` ACME ACMEConfig `koanf:"acme"` API APIConfig `koanf:"api"` + Metrics MetricsConfig `koanf:"metrics"` Otel OtelConfig `koanf:"otel"` Logging LoggingConfig `koanf:"logging"` Build BuildConfig `koanf:"build"` @@ -245,12 +255,19 @@ func defaultConfig() *Config { RedirectHTTP: true, }, + Metrics: MetricsConfig{ + ListenAddress: "127.0.0.1", + Port: 9464, + VMLabelBudget: 200, + }, + Otel: OtelConfig{ - Enabled: false, - Endpoint: "127.0.0.1:4317", - ServiceName: "hypeman", - ServiceInstanceID: getHostname(), - Insecure: true, + Enabled: false, + Endpoint: "127.0.0.1:4317", + ServiceName: "hypeman", + ServiceInstanceID: getHostname(), + Insecure: true, + MetricExportInterval: "60s", }, Logging: LoggingConfig{ @@ -373,6 +390,20 @@ func Load(configPath string) (*Config, error) { // Validate checks configuration values for correctness. // Returns an error if any configuration value is invalid. func (c *Config) Validate() error { + if strings.TrimSpace(c.Metrics.ListenAddress) == "" { + return fmt.Errorf("metrics.listen_address must not be empty") + } + if c.Metrics.Port < 1 || c.Metrics.Port > 65535 { + return fmt.Errorf("metrics.port must be between 1 and 65535, got %d", c.Metrics.Port) + } + if c.Metrics.VMLabelBudget <= 0 { + return fmt.Errorf("metrics.vm_label_budget must be positive, got %d", c.Metrics.VMLabelBudget) + } + if c.Otel.MetricExportInterval != "" { + if _, err := time.ParseDuration(c.Otel.MetricExportInterval); err != nil { + return fmt.Errorf("otel.metric_export_interval must be a valid duration, got %q: %w", c.Otel.MetricExportInterval, err) + } + } if c.Oversubscription.CPU <= 0 { return fmt.Errorf("oversubscription.cpu must be positive, got %v", c.Oversubscription.CPU) } diff --git a/cmd/api/config/config_test.go b/cmd/api/config/config_test.go new file mode 100644 index 00000000..97828db5 --- /dev/null +++ b/cmd/api/config/config_test.go @@ -0,0 +1,85 @@ +package config + +import ( + "os" + "path/filepath" + "testing" +) + +func TestDefaultConfigIncludesMetricsSettings(t *testing.T) { + cfg := defaultConfig() + + if cfg.Metrics.ListenAddress != "127.0.0.1" { + t.Fatalf("expected default metrics.listen_address to be 127.0.0.1, got %q", cfg.Metrics.ListenAddress) + } + if cfg.Metrics.Port != 9464 { + t.Fatalf("expected default metrics.port to be 9464, got %d", cfg.Metrics.Port) + } + if cfg.Metrics.VMLabelBudget != 200 { + t.Fatalf("expected default metrics.vm_label_budget to be 200, got %d", cfg.Metrics.VMLabelBudget) + } + if cfg.Otel.MetricExportInterval != "60s" { + t.Fatalf("expected default otel.metric_export_interval to be 60s, got %q", cfg.Otel.MetricExportInterval) + } +} + +func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) { + t.Setenv("METRICS__LISTEN_ADDRESS", "0.0.0.0") + t.Setenv("METRICS__PORT", "9999") + t.Setenv("METRICS__VM_LABEL_BUDGET", "350") + t.Setenv("OTEL__METRIC_EXPORT_INTERVAL", "15s") + + tmp := t.TempDir() + cfgPath := filepath.Join(tmp, "config.yaml") + if err := os.WriteFile(cfgPath, []byte("{}\n"), 0600); err != nil { + t.Fatalf("write temp config: %v", err) + } + + cfg, err := Load(cfgPath) + if err != nil { + t.Fatalf("load config: %v", err) + } + + if cfg.Metrics.ListenAddress != "0.0.0.0" { + t.Fatalf("expected metrics.listen_address override, got %q", cfg.Metrics.ListenAddress) + } + if cfg.Metrics.Port != 9999 { + t.Fatalf("expected metrics.port override, got %d", cfg.Metrics.Port) + } + if cfg.Metrics.VMLabelBudget != 350 { + t.Fatalf("expected metrics.vm_label_budget override, got %d", cfg.Metrics.VMLabelBudget) + } + if cfg.Otel.MetricExportInterval != "15s" { + t.Fatalf("expected otel.metric_export_interval override, got %q", cfg.Otel.MetricExportInterval) + } +} + +func TestValidateRejectsInvalidMetricsPort(t *testing.T) { + cfg := defaultConfig() + cfg.Metrics.Port = 0 + + err := cfg.Validate() + if err == nil { + t.Fatalf("expected validation error for invalid metrics port") + } +} + +func TestValidateRejectsInvalidMetricExportInterval(t *testing.T) { + cfg := defaultConfig() + cfg.Otel.MetricExportInterval = "not-a-duration" + + err := cfg.Validate() + if err == nil { + t.Fatalf("expected validation error for invalid metric export interval") + } +} + +func TestValidateRejectsInvalidVMLabelBudget(t *testing.T) { + cfg := defaultConfig() + cfg.Metrics.VMLabelBudget = 0 + + err := cfg.Validate() + if err == nil { + t.Fatalf("expected validation error for invalid vm label budget") + } +} diff --git a/cmd/api/main.go b/cmd/api/main.go index f937fb31..0f0f4e56 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -6,9 +6,11 @@ import ( "errors" "fmt" "log/slog" + "net" "net/http" "os" "os/signal" + "strconv" "strings" "syscall" "time" @@ -43,6 +45,19 @@ func main() { slog.Info("main() exiting normally") } +func metricsServerAddress(cfg *config.Config) string { + return net.JoinHostPort(cfg.Metrics.ListenAddress, strconv.Itoa(cfg.Metrics.Port)) +} + +func newMetricsServer(addr string, handler http.Handler) *http.Server { + mux := http.NewServeMux() + mux.Handle("/metrics", handler) + return &http.Server{ + Addr: addr, + Handler: mux, + } +} + func run() error { // Load config early for OTel initialization // Config path can be specified via CONFIG_PATH env var or defaults to platform-specific locations @@ -62,34 +77,32 @@ func run() error { // Initialize OpenTelemetry (before wire initialization) otelCfg := otel.Config{ - Enabled: cfg.Otel.Enabled, - Endpoint: cfg.Otel.Endpoint, - ServiceName: cfg.Otel.ServiceName, - ServiceInstanceID: cfg.Otel.ServiceInstanceID, - Insecure: cfg.Otel.Insecure, - Version: cfg.Version, - Env: cfg.Env, + Enabled: cfg.Otel.Enabled, + Endpoint: cfg.Otel.Endpoint, + ServiceName: cfg.Otel.ServiceName, + ServiceInstanceID: cfg.Otel.ServiceInstanceID, + Insecure: cfg.Otel.Insecure, + MetricExportInterval: cfg.Otel.MetricExportInterval, + Version: cfg.Version, + Env: cfg.Env, } otelProvider, otelShutdown, err := otel.Init(context.Background(), otelCfg) if err != nil { - // Log warning but don't fail - graceful degradation - slog.Warn("failed to initialize OpenTelemetry, continuing without telemetry", "error", err) - } - if otelShutdown != nil { - defer func() { - slog.Info("shutting down OpenTelemetry") - shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - if err := otelShutdown(shutdownCtx); err != nil { - slog.Warn("error shutting down OpenTelemetry", "error", err) - } - slog.Info("OpenTelemetry shutdown complete") - }() + return fmt.Errorf("initialize telemetry: %w", err) } + defer func() { + slog.Info("shutting down OpenTelemetry") + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := otelShutdown(shutdownCtx); err != nil { + slog.Warn("error shutting down OpenTelemetry", "error", err) + } + slog.Info("OpenTelemetry shutdown complete") + }() - // Initialize guest and vmm metrics if OTel is enabled - if otelProvider != nil && otelProvider.Meter != nil { + // Initialize guest and vmm metrics. + if otelProvider.Meter != nil { guestMetrics, err := guest.NewMetrics(otelProvider.Meter) if err == nil { guest.SetMetrics(guestMetrics) @@ -101,7 +114,7 @@ func run() error { } // Set global OTel log handler for logger package - if otelProvider != nil && otelProvider.LogHandler != nil { + if otelProvider.LogHandler != nil { otel.SetGlobalLogHandler(otelProvider.LogHandler) } @@ -127,7 +140,9 @@ func run() error { // Log OTel status if cfg.Otel.Enabled { - logger.Info("OpenTelemetry enabled", "endpoint", cfg.Otel.Endpoint, "service", cfg.Otel.ServiceName) + logger.Info("OpenTelemetry push enabled", "endpoint", cfg.Otel.Endpoint, "service", cfg.Otel.ServiceName, "metric_export_interval", cfg.Otel.MetricExportInterval) + } else { + logger.Info("OpenTelemetry push disabled; Prometheus pull metrics remain available") } // Validate JWT secret is configured @@ -237,7 +252,7 @@ func run() error { // Prepare HTTP metrics middleware (applied inside API group, not globally) // Global application breaks WebSocket (Hijacker) and SSE (Flusher) var httpMetricsMw func(http.Handler) http.Handler - if otelProvider != nil && otelProvider.Meter != nil { + if otelProvider.Meter != nil { httpMetrics, err := mw.NewHTTPMetrics(otelProvider.Meter) if err == nil { httpMetricsMw = httpMetrics.Middleware @@ -294,8 +309,12 @@ func run() error { r.Route("/v2", func(r chi.Router) { r.Use(middleware.RequestID) r.Use(middleware.RealIP) - r.Use(middleware.Logger) r.Use(middleware.Recoverer) + if cfg.Otel.Enabled { + r.Use(otelchi.Middleware(cfg.Otel.ServiceName, otelchi.WithChiRoutes(r))) + } + r.Use(mw.InjectLogger(logger)) + r.Use(mw.AccessLogger(accessLogger)) r.Use(mw.JwtAuth(app.Config.JwtSecret)) // Token endpoint for Docker Registry Token Authentication @@ -386,6 +405,12 @@ func run() error { Handler: r, } + metricsAddr := metricsServerAddress(cfg) + if otelProvider.MetricsHandler == nil { + return fmt.Errorf("metrics handler is not initialized") + } + metricsSrv := newMetricsServer(metricsAddr, otelProvider.MetricsHandler) + // Error group for coordinated shutdown grp, gctx := errgroup.WithContext(ctx) @@ -405,6 +430,15 @@ func run() error { return nil }) + grp.Go(func() error { + logger.Info("starting metrics endpoint", "addr", metricsAddr, "path", "/metrics") + if err := metricsSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + logger.Error("metrics server error", "error", err) + return err + } + return nil + }) + // Shutdown handler grp.Go(func() error { <-gctx.Done() @@ -415,11 +449,21 @@ func run() error { shutdownCtx, cancel := context.WithTimeout(shutdownCtx, 30*time.Second) defer cancel() - if err := srv.Shutdown(shutdownCtx); err != nil { + var shutdownErrs []error + + if err := srv.Shutdown(shutdownCtx); err != nil && !errors.Is(err, http.ErrServerClosed) { logger.Error("failed to shutdown http server", "error", err) - return err + shutdownErrs = append(shutdownErrs, fmt.Errorf("shutdown http server: %w", err)) + } else { + logger.Info("http server shutdown complete") + } + + if err := metricsSrv.Shutdown(shutdownCtx); err != nil && !errors.Is(err, http.ErrServerClosed) { + logger.Error("failed to shutdown metrics server", "error", err) + shutdownErrs = append(shutdownErrs, fmt.Errorf("shutdown metrics server: %w", err)) + } else { + logger.Info("metrics server shutdown complete") } - logger.Info("http server shutdown complete") // Shutdown ingress manager (stops Caddy if CADDY_STOP_ON_SHUTDOWN=true) if err := app.IngressManager.Shutdown(shutdownCtx); err != nil { @@ -429,7 +473,7 @@ func run() error { logger.Info("ingress manager shutdown complete") } - return nil + return errors.Join(shutdownErrs...) }) // Log rotation scheduler @@ -446,7 +490,7 @@ func run() error { if err := app.InstanceManager.RotateLogs(gctx, int64(logMaxSize), app.Config.Logging.MaxFiles); err != nil { logger.Error("log rotation failed", "error", err) } else { - logger.Info("log rotation completed", "max_size", logMaxSize, "max_files", app.Config.Logging.MaxFiles) + logger.Debug("log rotation completed", "max_size", logMaxSize, "max_files", app.Config.Logging.MaxFiles) } } } diff --git a/cmd/api/metrics_server_test.go b/cmd/api/metrics_server_test.go new file mode 100644 index 00000000..dd6931b6 --- /dev/null +++ b/cmd/api/metrics_server_test.go @@ -0,0 +1,93 @@ +package main + +import ( + "context" + "errors" + "io" + "net" + "net/http" + "strings" + "testing" + "time" + + "github.com/kernel/hypeman/cmd/api/config" +) + +func TestMetricsServerAddress(t *testing.T) { + cfg := &config.Config{} + cfg.Metrics.ListenAddress = "127.0.0.1" + cfg.Metrics.Port = 9464 + + got := metricsServerAddress(cfg) + want := "127.0.0.1:9464" + if got != want { + t.Fatalf("expected metrics address %q, got %q", want, got) + } +} + +func TestMetricsServerServesAndShutsDown(t *testing.T) { + h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte("hypeman_test_metric 1\n")) + }) + + ln, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + + srv := newMetricsServer(ln.Addr().String(), h) + errCh := make(chan error, 1) + go func() { + errCh <- srv.Serve(ln) + }() + + resp, err := http.Get("http://" + ln.Addr().String() + "/metrics") + if err != nil { + t.Fatalf("get metrics: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected status 200, got %d", resp.StatusCode) + } + body, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + if !strings.Contains(string(body), "hypeman_test_metric") { + t.Fatalf("expected test metric in body") + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := srv.Shutdown(ctx); err != nil { + t.Fatalf("shutdown server: %v", err) + } + + serveErr := <-errCh + if !errors.Is(serveErr, http.ErrServerClosed) { + t.Fatalf("expected ErrServerClosed after shutdown, got %v", serveErr) + } +} + +func TestMetricsServerBindFailure(t *testing.T) { + h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + }) + + occupied, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + defer occupied.Close() + + srv := newMetricsServer(occupied.Addr().String(), h) + err = srv.ListenAndServe() + if err == nil { + t.Fatalf("expected bind failure") + } + if errors.Is(err, http.ErrServerClosed) { + t.Fatalf("expected bind error, got ErrServerClosed") + } +} diff --git a/cmd/api/wire_gen.go b/cmd/api/wire_gen.go index c0a28717..6eb15ce6 100644 --- a/cmd/api/wire_gen.go +++ b/cmd/api/wire_gen.go @@ -64,7 +64,7 @@ func initializeApp() (*application, func(), error) { if err != nil { return nil, nil, err } - vm_metricsManager, err := providers.ProvideVMMetricsManager(instancesManager) + vm_metricsManager, err := providers.ProvideVMMetricsManager(instancesManager, config, logger) if err != nil { return nil, nil, err } diff --git a/config.example.darwin.yaml b/config.example.darwin.yaml index d491e493..69792eac 100644 --- a/config.example.darwin.yaml +++ b/config.example.darwin.yaml @@ -92,11 +92,17 @@ limits: # ============================================================================= # OpenTelemetry (optional, same as Linux) # ============================================================================= +# metrics: +# listen_address: 127.0.0.1 +# port: 9464 +# vm_label_budget: 200 +# # otel: # enabled: false # endpoint: 127.0.0.1:4317 # service_name: hypeman # insecure: true +# metric_export_interval: 60s # env: dev # ============================================================================= diff --git a/config.example.yaml b/config.example.yaml index 5b4f9db1..b34a86d2 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -84,12 +84,18 @@ data_dir: /var/lib/hypeman # ============================================================================= # OpenTelemetry Configuration # ============================================================================= +# metrics: +# listen_address: 127.0.0.1 +# port: 9464 +# vm_label_budget: 200 +# # otel: # enabled: false # endpoint: 127.0.0.1:4317 # service_name: hypeman # service_instance_id: "" # default: hostname # insecure: true +# metric_export_interval: 60s # env: dev # deployment environment # ============================================================================= diff --git a/dashboards/hypeman.json b/dashboards/hypeman.json index 6e09676e..5055fa69 100644 --- a/dashboards/hypeman.json +++ b/dashboards/hypeman.json @@ -512,7 +512,7 @@ "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "expr": "hypeman_vm_memory_utilization_ratio{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}", + "expr": "hypeman_vm_memory_rss_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"} / clamp_min(hypeman_vm_allocated_memory_bytes{deployment_environment_name=~\"$env\", service_instance_id=~\"$instance\"}, 1)", "legendFormat": "{{instance_name}}", "refId": "A" } diff --git a/go.mod b/go.mod index aea5119c..5b62ae91 100644 --- a/go.mod +++ b/go.mod @@ -35,6 +35,7 @@ require ( github.com/opencontainers/image-spec v1.1.1 github.com/opencontainers/runtime-spec v1.2.1 github.com/opencontainers/umoci v0.6.0 + github.com/prometheus/client_golang v1.23.0 github.com/riandyrn/otelchi v0.12.2 github.com/samber/lo v1.52.0 github.com/stretchr/testify v1.11.1 @@ -46,6 +47,7 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 + go.opentelemetry.io/otel/exporters/prometheus v0.60.0 go.opentelemetry.io/otel/metric v1.38.0 go.opentelemetry.io/otel/sdk v1.38.0 go.opentelemetry.io/otel/sdk/log v0.14.0 @@ -66,9 +68,11 @@ require ( github.com/Microsoft/go-winio v0.6.2 // indirect github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect github.com/apex/log v1.9.0 // indirect + github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect @@ -89,6 +93,7 @@ require ( github.com/go-openapi/swag v0.23.0 // indirect github.com/go-test/deep v1.1.1 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect + github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/klauspost/compress v1.18.0 // indirect @@ -105,12 +110,17 @@ require ( github.com/moby/sys/userns v0.1.0 // indirect github.com/moby/term v0.5.0 // indirect github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/oasdiff/yaml v0.0.0-20250309154309-f31be36b4037 // indirect github.com/oasdiff/yaml3 v0.0.0-20250309153720-d2182401db90 // indirect github.com/perimeterx/marshmallow v1.1.5 // indirect github.com/pierrec/lz4/v4 v4.1.22 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.65.0 // indirect + github.com/prometheus/otlptranslator v0.0.2 // indirect + github.com/prometheus/procfs v0.17.0 // indirect github.com/rootless-containers/proto/go-proto v0.0.0-20230421021042-4cd87ebadd67 // indirect github.com/sirupsen/logrus v1.9.4-0.20230606125235-dd1b4c2e81af // indirect github.com/u-root/uio v0.0.0-20240224005618-d2acac8f3701 // indirect diff --git a/go.sum b/go.sum index d2e1ff06..b78d31f0 100644 --- a/go.sum +++ b/go.sum @@ -20,6 +20,8 @@ github.com/aphistic/golf v0.0.0-20180712155816-02c07f170c5a/go.mod h1:3NqKYiepwy github.com/aphistic/sweet v0.2.0/go.mod h1:fWDlIh/isSE9n6EPsRmC0det+whmX6dJid3stzu0Xys= github.com/aws/aws-sdk-go v1.20.6/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= github.com/aybabtme/rgbterm v0.0.0-20170906152045-cc83f3b3ce59/go.mod h1:q/89r3U2H7sSsE2t6Kca0lfwTK8JdoNGS/yzM/4iH5I= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= @@ -29,6 +31,8 @@ github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK3 github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= @@ -116,6 +120,8 @@ github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrRoT6yV5+wkrOpcszoIsO4+4ds248= +github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= @@ -150,6 +156,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= @@ -185,12 +193,12 @@ github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9 github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/nrednav/cuid2 v1.1.0 h1:Y2P9Fo1Iz7lKuwcn+fS0mbxkNvEqoNLUtm0+moHCnYc= github.com/nrednav/cuid2 v1.1.0/go.mod h1:jBjkJAI+QLM4EUGvtwGDHC1cP1QQrRNfLo/A7qJFDhA= github.com/oapi-codegen/nethttp-middleware v1.1.2 h1:TQwEU3WM6ifc7ObBEtiJgbRPaCe513tvJpiMJjypVPA= github.com/oapi-codegen/nethttp-middleware v1.1.2/go.mod h1:5qzjxMSiI8HjLljiOEjvs4RdrWyMPKnExeFS2kr8om4= -github.com/oapi-codegen/runtime v1.1.2 h1:P2+CubHq8fO4Q6fV1tqDBZHCwpVpvPg7oKiYzQgXIyI= -github.com/oapi-codegen/runtime v1.1.2/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg= github.com/oapi-codegen/runtime v1.2.0 h1:RvKc1CVS1QeKSNzO97FBQbSMZyQ8s6rZd+LpmzwHMP4= github.com/oapi-codegen/runtime v1.2.0/go.mod h1:Y7ZhmmlE8ikZOmuHRRndiIm7nf3xcVv+YMweKgG1DT0= github.com/oasdiff/yaml v0.0.0-20250309154309-f31be36b4037 h1:G7ERwszslrBzRxj//JalHPu/3yz+De2J+4aLtSRlHiY= @@ -217,6 +225,16 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= +github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= +github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= +github.com/prometheus/otlptranslator v0.0.2 h1:+1CdeLVrRQ6Psmhnobldo0kTp96Rj80DRXRd5OSnMEQ= +github.com/prometheus/otlptranslator v0.0.2/go.mod h1:P8AwMgdD7XEr6QRUJ2QWLpiAZTgTE2UYgjlu3svompI= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= github.com/riandyrn/otelchi v0.12.2 h1:6QhGv0LVw/dwjtPd12mnNrl0oEQF4ZAlmHcnlTYbeAg= github.com/riandyrn/otelchi v0.12.2/go.mod h1:weZZeUJURvtCcbWsdb7Y6F8KFZGedJlSrgUjq9VirV8= github.com/rogpeppe/fastuuid v1.1.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= @@ -250,9 +268,8 @@ github.com/u-root/u-root v0.15.0 h1:8JXfjAA/Vs8EXfZUA2ftvoHbiYYLdaU8umJ461aq+Jw= github.com/u-root/u-root v0.15.0/go.mod h1:/0Qr7qJeDwWxoKku2xKQ4Szc+SwBE3g9VE8jNiamsmc= github.com/u-root/uio v0.0.0-20240224005618-d2acac8f3701 h1:pyC9PaHYZFgEKFdlp3G8RaCKgVpHZnecvArXvPXcFkM= github.com/u-root/uio v0.0.0-20240224005618-d2acac8f3701/go.mod h1:P3a5rG4X7tI17Nn3aOIAYr5HbIMukwXG0urG0WuL8OA= -github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU= -github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE= +github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= github.com/vbatts/go-mtree v0.6.1-0.20250911112631-8307d76bc1b9 h1:R6l9BtUe83abUGu1YKGkfa17wMMFLt6mhHVQ8MxpfRE= github.com/vbatts/go-mtree v0.6.1-0.20250911112631-8307d76bc1b9/go.mod h1:W7bcG9PCn6lFY+ljGlZxx9DONkxL3v8a7HyN+PrSrjA= github.com/vbatts/tar-split v0.12.1 h1:CqKoORW7BUWBe7UL/iqTVvkTBOF8UvOMKOIZykxnnbo= @@ -284,6 +301,8 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4D go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0 h1:wpMfgF8E1rkrT1Z6meFh1NDtownE9Ii3n3X2GJYjsaU= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0/go.mod h1:wAy0T/dUbs468uOlkT31xjvqQgEVXv58BRFWEgn5v/0= +go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo= +go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk= go.opentelemetry.io/otel/log v0.14.0 h1:2rzJ+pOAZ8qmZ3DDHg73NEKzSZkhkGIua9gXtxNGgrM= go.opentelemetry.io/otel/log v0.14.0/go.mod h1:5jRG92fEAgx0SU/vFPxmJvhIuDU9E1SUnEQrMlJpOno= go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= diff --git a/lib/builds/manager.go b/lib/builds/manager.go index 41632e82..f9afaa19 100644 --- a/lib/builds/manager.go +++ b/lib/builds/manager.go @@ -814,17 +814,17 @@ func (m *manager) waitForResult(ctx context.Context, buildID string, inst *insta } defer conn.Close() - m.logger.Info("connected to builder agent", "instance", inst.Id) + m.logger.Debug("connected to builder agent", "instance", inst.Id) encoder := json.NewEncoder(conn) decoder := json.NewDecoder(conn) // Tell the agent we're ready - it may request secrets - m.logger.Info("sending host_ready to agent", "instance", inst.Id) + m.logger.Debug("sending host_ready to agent", "instance", inst.Id) if err := encoder.Encode(VsockMessage{Type: "host_ready"}); err != nil { return nil, fmt.Errorf("send host_ready: %w", err) } - m.logger.Info("host_ready sent, waiting for agent messages", "instance", inst.Id) + m.logger.Debug("host_ready sent, waiting for agent messages", "instance", inst.Id) // Handle messages from agent until we get the build result for { @@ -855,11 +855,11 @@ func (m *manager) waitForResult(ctx context.Context, buildID string, inst *insta } // Handle message based on type - m.logger.Info("received message from agent", "type", dr.response.Type, "instance", inst.Id) + m.logger.Debug("received message from agent", "type", dr.response.Type, "instance", inst.Id) switch dr.response.Type { case "get_secrets": // Agent is requesting secrets - m.logger.Info("agent requesting secrets", "instance", inst.Id, "secret_ids", dr.response.SecretIDs) + m.logger.Debug("agent requesting secrets", "instance", inst.Id, "secret_ids", dr.response.SecretIDs) // Fetch secrets from provider secrets, err := m.secretProvider.GetSecrets(ctx, dr.response.SecretIDs) @@ -872,7 +872,7 @@ func (m *manager) waitForResult(ctx context.Context, buildID string, inst *insta if err := encoder.Encode(VsockMessage{Type: "secrets_response", Secrets: secrets}); err != nil { return nil, fmt.Errorf("send secrets response: %w", err) } - m.logger.Info("sent secrets to agent", "count", len(secrets), "instance", inst.Id) + m.logger.Debug("sent secrets to agent", "count", len(secrets), "instance", inst.Id) case "log": // Stream log line to build log file immediately diff --git a/lib/hypervisor/qemu/process_test.go b/lib/hypervisor/qemu/process_test.go index 900e7300..bc8756ab 100644 --- a/lib/hypervisor/qemu/process_test.go +++ b/lib/hypervisor/qemu/process_test.go @@ -30,7 +30,9 @@ func TestGetVersion_Integration(t *testing.T) { p := paths.New(tmpDir) version, err := starter.GetVersion(p) - require.NoError(t, err, "GetVersion should not return an error") + if err != nil { + t.Skipf("Skipping test: QEMU binary is not usable: %v", err) + } // Verify version is not empty assert.NotEmpty(t, version, "Version should not be empty") diff --git a/lib/ingress/binaries_linux.go b/lib/ingress/binaries_linux.go index 2b2a6a87..8d1dbc92 100644 --- a/lib/ingress/binaries_linux.go +++ b/lib/ingress/binaries_linux.go @@ -66,7 +66,7 @@ func ExtractCaddyBinary(p *paths.Paths) (string, error) { if err := os.WriteFile(hashPath, []byte(embeddedHash), 0644); err != nil { // Non-fatal - binary is extracted, just won't have hash for next time // This could cause unnecessary re-extractions but won't break functionality - slog.Info("failed to write caddy binary hash file", "path", hashPath, "error", err) + slog.Warn("failed to write caddy binary hash file", "path", hashPath, "error", err) } return extractPath, nil diff --git a/lib/ingress/logs.go b/lib/ingress/logs.go index ea2f6c2d..f085b039 100644 --- a/lib/ingress/logs.go +++ b/lib/ingress/logs.go @@ -99,8 +99,8 @@ func (f *CaddyLogForwarder) forwardLogLine(ctx context.Context, line string) { var entry caddyLogEntry if err := json.Unmarshal([]byte(line), &entry); err != nil { - // If we can't parse, log raw line at info level - f.logger.InfoContext(ctx, "caddy: "+line) + // If we can't parse, keep raw line at debug to avoid info noise. + f.logger.DebugContext(ctx, "caddy: "+line) return } diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index ab5aa853..45ffbc1b 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -163,22 +163,29 @@ func (r *qemuInstanceResolver) ResolveInstance(ctx context.Context, nameOrID str return nameOrID, nameOrID, nil } -// TestQEMUBasicEndToEnd tests the complete instance lifecycle with QEMU. -// This is the primary integration test for QEMU support. -// It tests: create, get, list, logs, network, ingress, volumes, exec, and delete. -// It does NOT test: snapshot/standby, hot memory resize (not supported by QEMU in first pass). -func TestQEMUBasicEndToEnd(t *testing.T) { - t.Parallel() - // Require KVM access +func requireQEMUUsable(t *testing.T) { + t.Helper() + if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { t.Skip("/dev/kvm not available, skipping on this platform") } - // Require QEMU to be installed starter := qemu.NewStarter() if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Fatalf("QEMU not available: %v", err) + t.Skipf("QEMU not available: %v", err) + } + if _, err := starter.GetVersion(nil); err != nil { + t.Skipf("QEMU is installed but not usable: %v", err) } +} + +// TestQEMUBasicEndToEnd tests the complete instance lifecycle with QEMU. +// This is the primary integration test for QEMU support. +// It tests: create, get, list, logs, network, ingress, volumes, exec, and delete. +// It does NOT test: snapshot/standby, hot memory resize (not supported by QEMU in first pass). +func TestQEMUBasicEndToEnd(t *testing.T) { + t.Parallel() + requireQEMUUsable(t) manager, tmpDir := setupTestManagerForQEMU(t) ctx := context.Background() @@ -575,12 +582,7 @@ func TestQEMUEntrypointEnvVars(t *testing.T) { if os.Getuid() != 0 { t.Skip("Skipping test that requires root") } - - // Require QEMU to be installed - starter := qemu.NewStarter() - if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Fatalf("QEMU not available: %v", err) - } + requireQEMUUsable(t) mgr, tmpDir := setupTestManagerForQEMU(t) ctx := context.Background() @@ -749,16 +751,7 @@ func TestQEMUEntrypointEnvVars(t *testing.T) { // This tests QEMU's migrate-to-file snapshot mechanism. func TestQEMUStandbyAndRestore(t *testing.T) { t.Parallel() - // Require KVM access - if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { - t.Skip("/dev/kvm not available, skipping on this platform") - } - - // Require QEMU to be installed - starter := qemu.NewStarter() - if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Fatalf("QEMU not available: %v", err) - } + requireQEMUUsable(t) manager, tmpDir := setupTestManagerForQEMU(t) ctx := context.Background() @@ -870,14 +863,7 @@ func TestQEMUStandbyAndRestore(t *testing.T) { func TestQEMUForkFromRunningNetwork(t *testing.T) { t.Parallel() - if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { - t.Skip("/dev/kvm not available, skipping on this platform") - } - - starter := qemu.NewStarter() - if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Fatalf("QEMU not available: %v", err) - } + requireQEMUUsable(t) manager, tmpDir := setupTestManagerForQEMU(t) ctx := context.Background() @@ -960,14 +946,7 @@ func TestQEMUForkFromRunningNetwork(t *testing.T) { func TestQEMUSnapshotFeature(t *testing.T) { t.Parallel() - if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { - t.Skip("/dev/kvm not available, skipping on this platform") - } - - starter := qemu.NewStarter() - if _, err := starter.GetBinaryPath(nil, ""); err != nil { - t.Skipf("QEMU not available: %v", err) - } + requireQEMUUsable(t) mgr, tmpDir := setupTestManagerForQEMU(t) runStandbySnapshotScenario(t, mgr, tmpDir, snapshotScenarioConfig{ diff --git a/lib/middleware/otel.go b/lib/middleware/otel.go index 2cd42d86..663d11c2 100644 --- a/lib/middleware/otel.go +++ b/lib/middleware/otel.go @@ -14,6 +14,8 @@ import ( "go.opentelemetry.io/otel/metric" ) +const unmatchedRouteLabel = "__unmatched__" + // HTTPMetrics holds the OTel metrics for HTTP requests. type HTTPMetrics struct { requestsTotal metric.Int64Counter @@ -62,7 +64,7 @@ func (m *HTTPMetrics) Middleware(next http.Handler) http.Handler { // Get route pattern if available (chi specific) routePattern := chi.RouteContext(r.Context()).RoutePattern() if routePattern == "" { - routePattern = r.URL.Path + routePattern = unmatchedRouteLabel } // Record metrics diff --git a/lib/middleware/otel_metrics_test.go b/lib/middleware/otel_metrics_test.go new file mode 100644 index 00000000..274f162b --- /dev/null +++ b/lib/middleware/otel_metrics_test.go @@ -0,0 +1,54 @@ +package middleware + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" +) + +func TestHTTPMetrics_UnmatchedRouteUsesSentinelPathLabel(t *testing.T) { + reader := metric.NewManualReader() + provider := metric.NewMeterProvider(metric.WithReader(reader)) + meter := provider.Meter("test") + + httpMetrics, err := NewHTTPMetrics(meter) + require.NoError(t, err) + + handler := httpMetrics.Middleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNoContent) + })) + + req := httptest.NewRequest(http.MethodGet, "/dynamic/path/123", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + require.Equal(t, http.StatusNoContent, rec.Code) + + var rm metricdata.ResourceMetrics + err = reader.Collect(t.Context(), &rm) + require.NoError(t, err) + + var pathLabel string + foundMetric := false + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != "hypeman_http_requests_total" { + continue + } + foundMetric = true + sum, ok := m.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected sum metric data") + require.NotEmpty(t, sum.DataPoints) + v, ok := sum.DataPoints[0].Attributes.Value(attribute.Key("path")) + require.True(t, ok, "expected path label in metric attributes") + pathLabel = v.AsString() + } + } + + require.True(t, foundMetric, "expected hypeman_http_requests_total metric") + require.Equal(t, unmatchedRouteLabel, pathLabel) +} diff --git a/lib/otel/README.md b/lib/otel/README.md index 6db7a686..525da933 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -4,11 +4,19 @@ Provides OpenTelemetry initialization and metric definitions for Hypeman. ## Features -- OTLP export for traces, metrics, and logs (gRPC) +- Always-on Prometheus pull metrics via `/metrics` +- Optional OTLP push export for traces, metrics, and logs (gRPC) - Runtime metrics (Go GC, goroutines, memory) - Application-specific metrics per subsystem - Log bridging from slog to OTel (viewable in Grafana/Loki) -- Graceful degradation (failures don't crash the app) +- Single metric instrumentation pipeline shared by pull and push paths + +## Dual Export Model + +Hypeman always exposes metrics on `/metrics` (pull), by default on `127.0.0.1:9464`. +If `otel.enabled=true`, it also pushes the same metric stream on a schedule to OTLP. + +This keeps pull and push views aligned because both are sourced from the same OTel meter provider/instruments. ## Configuration @@ -20,6 +28,10 @@ Provides OpenTelemetry initialization and metric definitions for Hypeman. | `OTEL_SERVICE_NAME` | Service name | `hypeman` | | `OTEL_SERVICE_INSTANCE_ID` | Instance ID (`service.instance.id` attribute) | hostname | | `OTEL_INSECURE` | Disable TLS for OTLP | `true` | +| `OTEL__METRIC_EXPORT_INTERVAL` | OTLP metric push interval (when enabled) | `60s` | +| `METRICS__LISTEN_ADDRESS` | Bind address for `/metrics` listener | `127.0.0.1` | +| `METRICS__PORT` | Port for `/metrics` listener | `9464` | +| `METRICS__VM_LABEL_BUDGET` | Warning threshold for observed per-VM labeled VM metrics | `200` | ## Metrics @@ -80,19 +92,27 @@ Provides OpenTelemetry initialization and metric definitions for Hypeman. | `hypeman_exec_bytes_sent_total` | counter | | Bytes to guest (stdin) | | `hypeman_exec_bytes_received_total` | counter | | Bytes from guest (stdout+stderr) | +### VM Metrics Guardrails +| Metric | Type | Description | +|--------|------|-------------| +| `hypeman_vm_metrics_instances_observed` | gauge | Current number of VM instances represented by per-VM labeled metrics | +| `hypeman_vm_metrics_label_budget_exceeded_total` | counter | Count of transitions into over-budget VM metric label cardinality | + ## Usage ```go provider, shutdown, err := otel.Init(ctx, otel.Config{ - Enabled: true, - Endpoint: "localhost:4317", - ServiceName: "hypeman", + Enabled: true, + Endpoint: "localhost:4317", + ServiceName: "hypeman", + MetricExportInterval: "60s", }) defer shutdown(ctx) meter := provider.Meter // Use for creating metrics tracer := provider.Tracer // Use for creating traces logHandler := provider.LogHandler // Use with slog for logs to OTel +metricsHandler := provider.MetricsHandler // Attach to GET /metrics ``` ## Logs @@ -101,4 +121,3 @@ Logs are exported via the OTel log bridge (`otelslog`). When OTel is enabled, al - `subsystem` attribute (API, IMAGES, INSTANCES, etc.) - `trace_id` and `span_id` when available - Service attributes (name, instance, environment) - diff --git a/lib/otel/otel.go b/lib/otel/otel.go index 2b1c2e41..74911d5a 100644 --- a/lib/otel/otel.go +++ b/lib/otel/otel.go @@ -5,15 +5,20 @@ import ( "context" "fmt" "log/slog" + "net/http" goruntime "runtime" + "sync" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "go.opentelemetry.io/contrib/bridges/otelslog" otelruntime "go.opentelemetry.io/contrib/instrumentation/runtime" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + otelprometheus "go.opentelemetry.io/otel/exporters/prometheus" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" sdklog "go.opentelemetry.io/otel/sdk/log" @@ -26,13 +31,14 @@ import ( // Config holds OpenTelemetry configuration. type Config struct { - Enabled bool - Endpoint string - ServiceName string - ServiceInstanceID string - Insecure bool - Version string - Env string + Enabled bool + Endpoint string + ServiceName string + ServiceInstanceID string + Insecure bool + MetricExportInterval string + Version string + Env string } // Provider holds initialized OTel providers. @@ -43,23 +49,33 @@ type Provider struct { Tracer trace.Tracer Meter metric.Meter LogHandler slog.Handler + MetricsHandler http.Handler startTime time.Time } +var runtimeMetricsState struct { + mu sync.Mutex + started bool +} + +func startRuntimeMetricsOnce(meterProvider metric.MeterProvider) (bool, error) { + runtimeMetricsState.mu.Lock() + defer runtimeMetricsState.mu.Unlock() + + if runtimeMetricsState.started { + return false, nil + } + if err := otelruntime.Start(otelruntime.WithMeterProvider(meterProvider)); err != nil { + return false, err + } + runtimeMetricsState.started = true + return true, nil +} + // Init initializes OpenTelemetry with the given configuration. // Returns a shutdown function that should be called on application exit. -// If OTel is disabled, returns a no-op shutdown function. func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) error, error) { - if !cfg.Enabled { - // Return no-op provider when disabled - return &Provider{ - Tracer: otel.Tracer(cfg.ServiceName), - Meter: otel.Meter(cfg.ServiceName), - startTime: time.Now(), - }, func(context.Context) error { return nil }, nil - } - - // Create resource with service information + // Create resource with service information. res, err := resource.Merge( resource.Default(), resource.NewWithAttributes( @@ -74,110 +90,159 @@ func Init(ctx context.Context, cfg Config) (*Provider, func(context.Context) err return nil, nil, fmt.Errorf("create resource: %w", err) } - // Create trace exporter - traceOpts := []otlptracegrpc.Option{ - otlptracegrpc.WithEndpoint(cfg.Endpoint), - } - if cfg.Insecure { - traceOpts = append(traceOpts, otlptracegrpc.WithInsecure()) - } - traceExporter, err := otlptracegrpc.New(ctx, traceOpts...) + // Create Prometheus pull exporter and registry (required for always-on /metrics). + promRegistry := prometheus.NewRegistry() + promExporter, err := otelprometheus.New(otelprometheus.WithRegisterer(promRegistry)) if err != nil { - return nil, nil, fmt.Errorf("create trace exporter: %w", err) + return nil, nil, fmt.Errorf("create prometheus exporter: %w", err) } - // Create tracer provider - tracerProvider := sdktrace.NewTracerProvider( - sdktrace.WithBatcher(traceExporter), - sdktrace.WithResource(res), - ) - - // Create metric exporter - metricOpts := []otlpmetricgrpc.Option{ - otlpmetricgrpc.WithEndpoint(cfg.Endpoint), - } - if cfg.Insecure { - metricOpts = append(metricOpts, otlpmetricgrpc.WithInsecure()) + meterProviderOpts := []sdkmetric.Option{ + sdkmetric.WithReader(promExporter), + sdkmetric.WithResource(res), } - metricExporter, err := otlpmetricgrpc.New(ctx, metricOpts...) - if err != nil { - tracerProvider.Shutdown(ctx) - return nil, nil, fmt.Errorf("create metric exporter: %w", err) + + // Add OTLP metric push reader when enabled. Failures are non-fatal. + if cfg.Enabled { + metricOpts := []otlpmetricgrpc.Option{ + otlpmetricgrpc.WithEndpoint(cfg.Endpoint), + } + if cfg.Insecure { + metricOpts = append(metricOpts, otlpmetricgrpc.WithInsecure()) + } + metricExporter, metricErr := otlpmetricgrpc.New(ctx, metricOpts...) + if metricErr != nil { + slog.Warn("failed to initialize OTLP metric push exporter; continuing with pull metrics only", "error", metricErr) + } else { + periodicOpts := []sdkmetric.PeriodicReaderOption{} + if cfg.MetricExportInterval != "" { + if interval, parseErr := time.ParseDuration(cfg.MetricExportInterval); parseErr != nil { + slog.Warn("invalid OTLP metric export interval; using default", "value", cfg.MetricExportInterval, "error", parseErr) + } else { + periodicOpts = append(periodicOpts, sdkmetric.WithInterval(interval)) + } + } + meterProviderOpts = append(meterProviderOpts, sdkmetric.WithReader(sdkmetric.NewPeriodicReader(metricExporter, periodicOpts...))) + } } - // Create meter provider - meterProvider := sdkmetric.NewMeterProvider( - sdkmetric.WithReader(sdkmetric.NewPeriodicReader(metricExporter)), - sdkmetric.WithResource(res), - ) + // Create meter provider with at least the Prometheus pull reader. + meterProvider := sdkmetric.NewMeterProvider(meterProviderOpts...) - // Create log exporter - logOpts := []otlploggrpc.Option{ - otlploggrpc.WithEndpoint(cfg.Endpoint), - } - if cfg.Insecure { - logOpts = append(logOpts, otlploggrpc.WithInsecure()) - } - logExporter, err := otlploggrpc.New(ctx, logOpts...) - if err != nil { - tracerProvider.Shutdown(ctx) - meterProvider.Shutdown(ctx) - return nil, nil, fmt.Errorf("create log exporter: %w", err) - } + var tracerProvider *sdktrace.TracerProvider + var loggerProvider *sdklog.LoggerProvider + var logHandler slog.Handler - // Create logger provider - loggerProvider := sdklog.NewLoggerProvider( - sdklog.WithProcessor(sdklog.NewBatchProcessor(logExporter)), - sdklog.WithResource(res), - ) + if cfg.Enabled { + // Create trace provider (exporting when OTLP trace exporter initializes successfully). + traceOpts := []otlptracegrpc.Option{ + otlptracegrpc.WithEndpoint(cfg.Endpoint), + } + if cfg.Insecure { + traceOpts = append(traceOpts, otlptracegrpc.WithInsecure()) + } + traceExporter, traceErr := otlptracegrpc.New(ctx, traceOpts...) + if traceErr != nil { + slog.Warn("failed to initialize OTLP trace exporter; continuing without trace export", "error", traceErr) + tracerProvider = sdktrace.NewTracerProvider( + sdktrace.WithResource(res), + ) + } else { + tracerProvider = sdktrace.NewTracerProvider( + sdktrace.WithBatcher(traceExporter), + sdktrace.WithResource(res), + ) + } + + // Create log exporter/provider. Failures are non-fatal. + logOpts := []otlploggrpc.Option{ + otlploggrpc.WithEndpoint(cfg.Endpoint), + } + if cfg.Insecure { + logOpts = append(logOpts, otlploggrpc.WithInsecure()) + } + logExporter, logErr := otlploggrpc.New(ctx, logOpts...) + if logErr != nil { + slog.Warn("failed to initialize OTLP log exporter; continuing without OTLP log export", "error", logErr) + } else { + loggerProvider = sdklog.NewLoggerProvider( + sdklog.WithProcessor(sdklog.NewBatchProcessor(logExporter)), + sdklog.WithResource(res), + ) + logHandler = otelslog.NewHandler(cfg.ServiceName, otelslog.WithLoggerProvider(loggerProvider)) + } + } - // Set global providers - otel.SetTracerProvider(tracerProvider) + // Set global providers. + if tracerProvider != nil { + otel.SetTracerProvider(tracerProvider) + } otel.SetMeterProvider(meterProvider) otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( propagation.TraceContext{}, propagation.Baggage{}, )) - // Start runtime metrics collection - if err := otelruntime.Start(otelruntime.WithMeterProvider(meterProvider)); err != nil { - tracerProvider.Shutdown(ctx) + // Start runtime metrics collection. + startedRuntimeMetrics, err := startRuntimeMetricsOnce(meterProvider) + if err != nil { + if tracerProvider != nil { + tracerProvider.Shutdown(ctx) + } meterProvider.Shutdown(ctx) - loggerProvider.Shutdown(ctx) + if loggerProvider != nil { + loggerProvider.Shutdown(ctx) + } return nil, nil, fmt.Errorf("start runtime metrics: %w", err) } + if !startedRuntimeMetrics { + // Tests may initialize telemetry more than once in the same process. + // Runtime instrumentation is process-scoped, so skip duplicate starts. + slog.Warn("runtime metrics instrumentation already initialized; skipping duplicate start") + } - // Create slog handler that bridges to OTel - logHandler := otelslog.NewHandler(cfg.ServiceName, otelslog.WithLoggerProvider(loggerProvider)) + tracer := otel.Tracer(cfg.ServiceName) + if tracerProvider != nil { + tracer = tracerProvider.Tracer(cfg.ServiceName) + } provider := &Provider{ TracerProvider: tracerProvider, MeterProvider: meterProvider, LoggerProvider: loggerProvider, - Tracer: tracerProvider.Tracer(cfg.ServiceName), + Tracer: tracer, Meter: meterProvider.Meter(cfg.ServiceName), LogHandler: logHandler, + MetricsHandler: promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}), startTime: time.Now(), } - // Register system metrics (uptime, info) + // Register system metrics (uptime, info). if err := provider.registerSystemMetrics(cfg); err != nil { - tracerProvider.Shutdown(ctx) + if tracerProvider != nil { + tracerProvider.Shutdown(ctx) + } meterProvider.Shutdown(ctx) - loggerProvider.Shutdown(ctx) + if loggerProvider != nil { + loggerProvider.Shutdown(ctx) + } return nil, nil, fmt.Errorf("register system metrics: %w", err) } shutdown := func(ctx context.Context) error { var errs []error - if err := tracerProvider.Shutdown(ctx); err != nil { - errs = append(errs, fmt.Errorf("shutdown tracer: %w", err)) + if tracerProvider != nil { + if err := tracerProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown tracer: %w", err)) + } } if err := meterProvider.Shutdown(ctx); err != nil { errs = append(errs, fmt.Errorf("shutdown meter: %w", err)) } - if err := loggerProvider.Shutdown(ctx); err != nil { - errs = append(errs, fmt.Errorf("shutdown logger: %w", err)) + if loggerProvider != nil { + if err := loggerProvider.Shutdown(ctx); err != nil { + errs = append(errs, fmt.Errorf("shutdown logger: %w", err)) + } } if len(errs) > 0 { return fmt.Errorf("shutdown errors: %v", errs) diff --git a/lib/otel/otel_test.go b/lib/otel/otel_test.go new file mode 100644 index 00000000..04ce5826 --- /dev/null +++ b/lib/otel/otel_test.go @@ -0,0 +1,100 @@ +package otel + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +func TestInitMetricsHandlerAlwaysAvailable(t *testing.T) { + tests := []struct { + name string + cfg Config + }{ + { + name: "otel push disabled", + cfg: Config{ + Enabled: false, + ServiceName: "hypeman-test", + ServiceInstanceID: "test-instance", + MetricExportInterval: "60s", + Version: "test", + Env: "test", + }, + }, + { + name: "otel push enabled but misconfigured endpoint", + cfg: Config{ + Enabled: true, + Endpoint: "://bad-endpoint", + ServiceName: "hypeman-test", + ServiceInstanceID: "test-instance", + Insecure: true, + MetricExportInterval: "5s", + Version: "test", + Env: "test", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + provider, shutdown, err := Init(context.Background(), tt.cfg) + if err != nil { + t.Fatalf("init telemetry: %v", err) + } + t.Cleanup(func() { + _ = shutdown(context.Background()) + }) + + if provider.MetricsHandler == nil { + t.Fatalf("expected metrics handler") + } + + rr := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + provider.MetricsHandler.ServeHTTP(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", rr.Code) + } + if !strings.Contains(rr.Header().Get("Content-Type"), "text/plain") { + t.Fatalf("expected Prometheus text content type, got %q", rr.Header().Get("Content-Type")) + } + if !strings.Contains(rr.Body.String(), "hypeman_uptime_seconds") { + t.Fatalf("expected hypeman_uptime_seconds metric in output") + } + }) + } +} + +func TestInitRepeatedDoesNotFail(t *testing.T) { + cfg := Config{ + Enabled: false, + ServiceName: "hypeman-test", + ServiceInstanceID: "test-instance-repeat", + MetricExportInterval: "60s", + Version: "test", + Env: "test", + } + + provider1, shutdown1, err := Init(context.Background(), cfg) + if err != nil { + t.Fatalf("first init telemetry: %v", err) + } + t.Cleanup(func() { _ = shutdown1(context.Background()) }) + if provider1.MetricsHandler == nil { + t.Fatalf("expected first metrics handler") + } + + provider2, shutdown2, err := Init(context.Background(), cfg) + if err != nil { + t.Fatalf("second init telemetry: %v", err) + } + t.Cleanup(func() { _ = shutdown2(context.Background()) }) + if provider2.MetricsHandler == nil { + t.Fatalf("expected second metrics handler") + } +} diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 25cc3536..94fa8aef 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -167,8 +167,10 @@ func ProvideResourceManager(ctx context.Context, cfg *config.Config, p *paths.Pa } // ProvideVMMetricsManager provides the VM metrics manager for utilization tracking -func ProvideVMMetricsManager(instanceManager instances.Manager) (*vm_metrics.Manager, error) { +func ProvideVMMetricsManager(instanceManager instances.Manager, cfg *config.Config, log *slog.Logger) (*vm_metrics.Manager, error) { mgr := vm_metrics.NewManager() + mgr.SetVMLabelBudget(cfg.Metrics.VMLabelBudget) + mgr.SetLogger(log) // Adapt instance manager to vm_metrics.InstanceSource interface adapter := vm_metrics.NewInstanceManagerAdapter(instanceManager) diff --git a/lib/vm_metrics/README.md b/lib/vm_metrics/README.md index 3257b84a..66f1fc7e 100644 --- a/lib/vm_metrics/README.md +++ b/lib/vm_metrics/README.md @@ -22,9 +22,20 @@ This approach works for both Cloud Hypervisor and QEMU without requiring any in- | `hypeman_vm_allocated_memory_bytes` | Gauge | Total allocated memory | | `hypeman_vm_network_rx_bytes_total` | Counter | Network bytes received | | `hypeman_vm_network_tx_bytes_total` | Counter | Network bytes transmitted | -| `hypeman_vm_memory_utilization_ratio` | Gauge | RSS / allocated memory | +| `hypeman_vm_metrics_instances_observed` | Gauge | Number of instances currently represented by per-VM metrics | +| `hypeman_vm_metrics_label_budget_exceeded_total` | Counter | Transitions into over-budget per-VM metric cardinality | -All metrics include `instance_id` and `instance_name` labels. +Per-VM utilization series include `instance_id` and `instance_name` labels. + +## Cardinality Guardrail + +Per-VM labels are intentionally retained for operational visibility. Use the label budget guardrail to detect growth: + +- Config key: `metrics.vm_label_budget` (env: `METRICS__VM_LABEL_BUDGET`, default `200`) +- `hypeman_vm_metrics_instances_observed` reports current per-VM series driver +- `hypeman_vm_metrics_label_budget_exceeded_total` increments when moving from under-budget to over-budget + +When observed instances exceed budget, Hypeman logs a one-time WARN transition and emits a one-time INFO recovery when back under budget. ## API Endpoint @@ -49,6 +60,8 @@ Returns current utilization for a specific instance: } ``` +Note: `memory_utilization_ratio` is part of the API response for convenience, but not exported as a standalone Prometheus metric. + ## Architecture ``` diff --git a/lib/vm_metrics/manager.go b/lib/vm_metrics/manager.go index 112047d8..8f545c54 100644 --- a/lib/vm_metrics/manager.go +++ b/lib/vm_metrics/manager.go @@ -2,6 +2,7 @@ package vm_metrics import ( "context" + "log/slog" "sync" "github.com/kernel/hypeman/lib/logger" @@ -12,14 +13,21 @@ import ( // Manager collects and exposes VM resource utilization metrics. // It reads from /proc and TAP interfaces to gather real-time statistics. type Manager struct { - mu sync.RWMutex - source InstanceSource - otel *otelMetrics + mu sync.RWMutex + source InstanceSource + otel *otelMetrics + log *slog.Logger + vmLabelBudget int + vmLabelBudgetExceeded bool + vmLabelBudgetEvents int64 } // NewManager creates a new VM metrics manager. func NewManager() *Manager { - return &Manager{} + return &Manager{ + log: slog.Default(), + vmLabelBudget: 200, + } } // SetInstanceSource sets the source for instance information. @@ -30,6 +38,26 @@ func (m *Manager) SetInstanceSource(source InstanceSource) { m.source = source } +// SetLogger sets the logger used for guardrail transition logs. +func (m *Manager) SetLogger(log *slog.Logger) { + if log == nil { + log = slog.Default() + } + m.mu.Lock() + defer m.mu.Unlock() + m.log = log +} + +// SetVMLabelBudget sets the max expected number of per-VM labeled series. +func (m *Manager) SetVMLabelBudget(budget int) { + if budget <= 0 { + return + } + m.mu.Lock() + defer m.mu.Unlock() + m.vmLabelBudget = budget +} + // InitializeOTel sets up OpenTelemetry metrics. // If meter is nil, OTel metrics are disabled. func (m *Manager) InitializeOTel(meter metric.Meter) error { @@ -49,6 +77,42 @@ func (m *Manager) InitializeOTel(meter metric.Meter) error { return nil } +// observeVMLabelBudget checks current per-VM metric cardinality against budget. +// Returns true when transitioning into over-budget state. +func (m *Manager) observeVMLabelBudget(ctx context.Context, observedInstances int) bool { + m.mu.Lock() + budget := m.vmLabelBudget + if budget <= 0 { + budget = 1 + } + log := m.log + wasOverBudget := m.vmLabelBudgetExceeded + isOverBudget := observedInstances > budget + if isOverBudget && !wasOverBudget { + m.vmLabelBudgetEvents++ + } + m.vmLabelBudgetExceeded = isOverBudget + m.mu.Unlock() + + if log == nil { + log = slog.Default() + } + if isOverBudget && !wasOverBudget { + log.WarnContext(ctx, "vm metrics label budget exceeded", "observed_instances", observedInstances, "budget", budget) + return true + } + if !isOverBudget && wasOverBudget { + log.InfoContext(ctx, "vm metrics label budget recovered", "observed_instances", observedInstances, "budget", budget) + } + return false +} + +func (m *Manager) vmLabelBudgetEventCount() int64 { + m.mu.RLock() + defer m.mu.RUnlock() + return m.vmLabelBudgetEvents +} + // GetInstanceStats collects metrics for a single instance. // Returns nil if the instance is not running or stats cannot be collected. func (m *Manager) GetInstanceStats(ctx context.Context, info InstanceInfo) *VMStats { diff --git a/lib/vm_metrics/manager_test.go b/lib/vm_metrics/manager_test.go index d3e009f3..81603741 100644 --- a/lib/vm_metrics/manager_test.go +++ b/lib/vm_metrics/manager_test.go @@ -4,7 +4,9 @@ package vm_metrics import ( "context" + "log/slog" "os" + "sync" "testing" "github.com/stretchr/testify/assert" @@ -202,3 +204,52 @@ func TestBuildInstanceInfo(t *testing.T) { func ptrFloat64(v float64) *float64 { return &v } + +type recordedLog struct { + level slog.Level + msg string +} + +type recordingHandler struct { + mu sync.Mutex + records []recordedLog +} + +func (h *recordingHandler) Enabled(_ context.Context, _ slog.Level) bool { return true } + +func (h *recordingHandler) Handle(_ context.Context, r slog.Record) error { + h.mu.Lock() + defer h.mu.Unlock() + h.records = append(h.records, recordedLog{level: r.Level, msg: r.Message}) + return nil +} + +func (h *recordingHandler) WithAttrs(_ []slog.Attr) slog.Handler { return h } +func (h *recordingHandler) WithGroup(_ string) slog.Handler { return h } + +func (h *recordingHandler) count(level slog.Level, msg string) int { + h.mu.Lock() + defer h.mu.Unlock() + count := 0 + for _, r := range h.records { + if r.level == level && r.msg == msg { + count++ + } + } + return count +} + +func TestManager_ObserveVMLabelBudget_LogsTransitions(t *testing.T) { + mgr := NewManager() + mgr.SetVMLabelBudget(1) + + recorder := &recordingHandler{} + mgr.SetLogger(slog.New(recorder)) + + assert.True(t, mgr.observeVMLabelBudget(context.Background(), 2), "first over-budget observation should return true") + assert.False(t, mgr.observeVMLabelBudget(context.Background(), 3), "remaining over-budget should not trigger transition") + assert.False(t, mgr.observeVMLabelBudget(context.Background(), 1), "recovery should not count as breach transition") + + assert.Equal(t, 1, recorder.count(slog.LevelWarn, "vm metrics label budget exceeded")) + assert.Equal(t, 1, recorder.count(slog.LevelInfo, "vm metrics label budget recovered")) +} diff --git a/lib/vm_metrics/metrics.go b/lib/vm_metrics/metrics.go index 6bb64e90..c319ddcc 100644 --- a/lib/vm_metrics/metrics.go +++ b/lib/vm_metrics/metrics.go @@ -9,14 +9,15 @@ import ( // otelMetrics holds the OpenTelemetry instruments for VM metrics. type otelMetrics struct { - cpuSecondsTotal metric.Float64ObservableCounter - allocatedVcpus metric.Int64ObservableGauge - memoryRSSBytes metric.Int64ObservableGauge - memoryVMSBytes metric.Int64ObservableGauge - allocatedMemoryBytes metric.Int64ObservableGauge - networkRxBytesTotal metric.Int64ObservableCounter - networkTxBytesTotal metric.Int64ObservableCounter - memoryUtilizationRatio metric.Float64ObservableGauge + cpuSecondsTotal metric.Float64ObservableCounter + allocatedVcpus metric.Int64ObservableGauge + memoryRSSBytes metric.Int64ObservableGauge + memoryVMSBytes metric.Int64ObservableGauge + allocatedMemoryBytes metric.Int64ObservableGauge + networkRxBytesTotal metric.Int64ObservableCounter + networkTxBytesTotal metric.Int64ObservableCounter + instancesObserved metric.Int64ObservableGauge + labelBudgetExceededTotal metric.Int64ObservableCounter } // newOTelMetrics creates and registers all VM utilization metrics. @@ -91,11 +92,19 @@ func newOTelMetrics(meter metric.Meter, m *Manager) (*otelMetrics, error) { return nil, err } - // Memory utilization ratio (RSS / allocated) - memoryUtilizationRatio, err := meter.Float64ObservableGauge( - "hypeman_vm_memory_utilization_ratio", - metric.WithDescription("Memory utilization ratio (RSS / allocated memory)"), - metric.WithUnit("1"), + // Number of instances currently represented by per-VM metrics. + instancesObserved, err := meter.Int64ObservableGauge( + "hypeman_vm_metrics_instances_observed", + metric.WithDescription("Current number of VM instances represented by per-VM labeled metrics"), + metric.WithUnit("{instance}"), + ) + if err != nil { + return nil, err + } + + labelBudgetExceededTotal, err := meter.Int64ObservableCounter( + "hypeman_vm_metrics_label_budget_exceeded_total", + metric.WithDescription("Total number of transitions into over-budget VM metric label cardinality"), ) if err != nil { return nil, err @@ -109,6 +118,10 @@ func newOTelMetrics(meter metric.Meter, m *Manager) (*otelMetrics, error) { // Log error but don't fail the callback return nil } + observed := len(stats) + o.ObserveInt64(instancesObserved, int64(observed)) + m.observeVMLabelBudget(ctx, observed) + o.ObserveInt64(labelBudgetExceededTotal, m.vmLabelBudgetEventCount()) for _, s := range stats { attrs := metric.WithAttributes( @@ -128,11 +141,6 @@ func newOTelMetrics(meter metric.Meter, m *Manager) (*otelMetrics, error) { o.ObserveInt64(memoryVMSBytes, int64(s.MemoryVMSBytes), attrs) o.ObserveInt64(networkRxBytesTotal, int64(s.NetRxBytes), attrs) o.ObserveInt64(networkTxBytesTotal, int64(s.NetTxBytes), attrs) - - // Compute utilization ratio - if ratio := s.MemoryUtilizationRatio(); ratio != nil { - o.ObserveFloat64(memoryUtilizationRatio, *ratio, attrs) - } } return nil @@ -144,20 +152,22 @@ func newOTelMetrics(meter metric.Meter, m *Manager) (*otelMetrics, error) { allocatedMemoryBytes, networkRxBytesTotal, networkTxBytesTotal, - memoryUtilizationRatio, + instancesObserved, + labelBudgetExceededTotal, ) if err != nil { return nil, err } return &otelMetrics{ - cpuSecondsTotal: cpuSecondsTotal, - allocatedVcpus: allocatedVcpus, - memoryRSSBytes: memoryRSSBytes, - memoryVMSBytes: memoryVMSBytes, - allocatedMemoryBytes: allocatedMemoryBytes, - networkRxBytesTotal: networkRxBytesTotal, - networkTxBytesTotal: networkTxBytesTotal, - memoryUtilizationRatio: memoryUtilizationRatio, + cpuSecondsTotal: cpuSecondsTotal, + allocatedVcpus: allocatedVcpus, + memoryRSSBytes: memoryRSSBytes, + memoryVMSBytes: memoryVMSBytes, + allocatedMemoryBytes: allocatedMemoryBytes, + networkRxBytesTotal: networkRxBytesTotal, + networkTxBytesTotal: networkTxBytesTotal, + instancesObserved: instancesObserved, + labelBudgetExceededTotal: labelBudgetExceededTotal, }, nil } diff --git a/lib/vm_metrics/metrics_test.go b/lib/vm_metrics/metrics_test.go index c14b4a0b..a372f78c 100644 --- a/lib/vm_metrics/metrics_test.go +++ b/lib/vm_metrics/metrics_test.go @@ -8,6 +8,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/metric/metricdata" ) @@ -62,12 +63,33 @@ func TestOTelMetrics_Registration(t *testing.T) { "hypeman_vm_allocated_memory_bytes", "hypeman_vm_network_rx_bytes_total", "hypeman_vm_network_tx_bytes_total", - "hypeman_vm_memory_utilization_ratio", + "hypeman_vm_metrics_instances_observed", + "hypeman_vm_metrics_label_budget_exceeded_total", } for _, expected := range expectedMetrics { assert.True(t, metricNames[expected], "should have metric %s", expected) } + assert.False(t, metricNames["hypeman_vm_memory_utilization_ratio"], "should not have denormalized ratio metric") + + // Validate per-VM labels are still present on VM series. + var foundCPUMetric bool + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != "hypeman_vm_cpu_seconds_total" { + continue + } + foundCPUMetric = true + sum, ok := m.Data.(metricdata.Sum[float64]) + require.True(t, ok, "expected cpu metric to be sum") + require.NotEmpty(t, sum.DataPoints) + _, hasInstanceID := sum.DataPoints[0].Attributes.Value(attribute.Key("instance_id")) + _, hasInstanceName := sum.DataPoints[0].Attributes.Value(attribute.Key("instance_name")) + assert.True(t, hasInstanceID, "cpu series should include instance_id label") + assert.True(t, hasInstanceName, "cpu series should include instance_name label") + } + } + require.True(t, foundCPUMetric, "expected cpu metric datapoint") } func TestOTelMetrics_NilMeter(t *testing.T) { @@ -75,3 +97,49 @@ func TestOTelMetrics_NilMeter(t *testing.T) { err := mgr.InitializeOTel(nil) require.NoError(t, err, "nil meter should not error") } + +func TestOTelMetrics_LabelBudgetGuardrails(t *testing.T) { + reader := metric.NewManualReader() + provider := metric.NewMeterProvider(metric.WithReader(reader)) + meter := provider.Meter("test") + + source := &mockInstanceSource{ + instances: []InstanceInfo{ + {ID: "vm-1", Name: "vm-one", AllocatedVcpus: 2, AllocatedMemoryBytes: 1024}, + {ID: "vm-2", Name: "vm-two", AllocatedVcpus: 2, AllocatedMemoryBytes: 1024}, + }, + } + mgr := NewManager() + mgr.SetInstanceSource(source) + mgr.SetVMLabelBudget(1) + require.NoError(t, mgr.InitializeOTel(meter)) + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(t.Context(), &rm)) + + // Reduce cardinality to trigger recovery and ensure counter does not increment again. + source.instances = source.instances[:1] + require.NoError(t, reader.Collect(t.Context(), &rm)) + + var observed int64 + var budgetExceededTransitions int64 + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + switch m.Name { + case "hypeman_vm_metrics_instances_observed": + g, ok := m.Data.(metricdata.Gauge[int64]) + require.True(t, ok) + require.NotEmpty(t, g.DataPoints) + observed = g.DataPoints[0].Value + case "hypeman_vm_metrics_label_budget_exceeded_total": + sum, ok := m.Data.(metricdata.Sum[int64]) + require.True(t, ok) + require.NotEmpty(t, sum.DataPoints) + budgetExceededTransitions = sum.DataPoints[0].Value + } + } + } + + assert.Equal(t, int64(1), observed, "observed instances gauge should reflect latest scrape") + assert.Equal(t, int64(1), budgetExceededTransitions, "counter should increment only on over-budget transition") +} diff --git a/skills/monitoring-feature-observability/SKILL.md b/skills/monitoring-feature-observability/SKILL.md new file mode 100644 index 00000000..d01d6620 --- /dev/null +++ b/skills/monitoring-feature-observability/SKILL.md @@ -0,0 +1,43 @@ +--- +name: monitoring-feature-observability +description: Add or adjust monitoring for a Hypeman feature using repository standards for logs, traces, and metrics. Use when a user asks for instrumentation, observability reviews, telemetry consistency changes, metric design, or production-signal improvements. +--- + +# Monitoring Feature Observability + +Your task is to add monitoring for a specific feature or to perform a specific monitoring-related ask from the user. + +## Logging + +- Logging uses structured slog JSON with per-subsystem levels (`LOG_LEVEL`, `LOG_LEVEL_`). Logs are enriched with `subsystem`, and when trace context exists, `trace_id`/`span_id`: `lib/logger/logger.go`. +- During normal running of the system without requests or events being sent to the system, there should be minimal to no logging at the INFO level or greater. So ongoing maintenance items should not be logging at INFO or greater. +- During a request (for example, API call) or interrupt/event (for example, guest program stops), the normal case should have about one informative log at INFO level, usually just one log. +- Other useful but normal information within a single request/event should be at DEBUG level accordingly. Do not use TRACE level. +- Logs resulting from a single request or event should not provide much duplicated information. +- Logs with `instance_id` are also duplicated into per-instance `logs/hypeman.log`, and instance log APIs stream them, so be sure to set `instance_id` or other `resourcetype_id` accordingly. +- Use WARN and ERROR logs appropriately. +- Logs should be associated with traces. + +## Tracing + +- All API requests should support tracing. +- Tracing should span down as far as reasonable, ideally all the way down unless there is a good reason not to. +- For example, trace down into clients calling each hypervisor. +- Per-instance identifiers (for example, `instance_id`) are allowed in trace attributes when they materially improve debugging or correlation. +- Still avoid sensitive or unbounded attributes by default (for example, full guest paths, user identifiers, tokens, arbitrary payload fields). + +## Metrics + +- Metrics should be created in Prometheus/OpenMetrics format using normal best practices. +- Metrics are emitted via OTel instruments (counters/histograms/gauges) across subsystems (instances, images, network, and so on). +- Low-cardinality labels only (for example, no VM name, IP address, or ID labels). +- Per-VM metric labels are an explicit exception when operationally required (for example `instance_id`, `instance_name`) and should be guarded with budget/alerting. +- Confirm with the user before adding any new high-cardinality metric label. +- Use counters where advisable to avoid sampling errors in data. +- Usually include timing histogram metrics. +- Work with the user to agree on good application-level signals to monitor for a given feature, providing examples in terms of what it would look like on the `/metrics` endpoint. +- All features should have at least one good application-level metric. +- Confirm with the user before removing metrics. +- Do not create denormalized metrics (containing information that can be derived from other metrics). + +Look in `DEVELOPMENT.md` (section: `Local OpenTelemetry (optional)`) for how to collect telemetry from a local server. diff --git a/skills/monitoring-feature-observability/agents/openai.yaml b/skills/monitoring-feature-observability/agents/openai.yaml new file mode 100644 index 00000000..2f8252d4 --- /dev/null +++ b/skills/monitoring-feature-observability/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Feature Monitoring" + short_description: "Instrument Hypeman logs, traces, and metrics"