Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.darwin.example
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ MAX_MEMORY_PER_INSTANCE=8GB
# OTEL_ENDPOINT=127.0.0.1:4317
# OTEL_SERVICE_NAME=hypeman
# OTEL_INSECURE=true
# OTEL__METRIC_EXPORT_INTERVAL=60s
# METRICS__LISTEN_ADDRESS=127.0.0.1
# METRICS__PORT=9464
# METRICS__VM_LABEL_BUDGET=200
# ENV=dev

# =============================================================================
Expand Down
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ DATA_DIR=/var/lib/hypeman
# OTEL_SERVICE_NAME=hypeman
# OTEL_SERVICE_INSTANCE_ID= # default: hostname
# OTEL_INSECURE=true
# OTEL__METRIC_EXPORT_INTERVAL=60s # OTLP push cadence (when OTEL_ENABLED=true)
# METRICS__LISTEN_ADDRESS=127.0.0.1
# METRICS__PORT=9464
# METRICS__VM_LABEL_BUDGET=200 # warn when observed per-VM metric labels exceed budget
# ENV=dev # deployment environment

# =============================================================================
Expand Down
4 changes: 4 additions & 0 deletions DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ Common settings:
| `logging.level` | Log level (debug, info, warn, error) | `info` |
| `otel.enabled` | Enable OpenTelemetry traces/metrics | `false` |
| `otel.endpoint` | OTLP gRPC endpoint | `127.0.0.1:4317` |
| `otel.metric_export_interval` | OTLP metric push interval | `60s` |
| `metrics.listen_address` | Bind address for `/metrics` endpoint | `127.0.0.1` |
| `metrics.port` | Port for `/metrics` endpoint | `9464` |
| `metrics.vm_label_budget` | Warning threshold for observed per-VM metric labels | `200` |
| `limits.max_concurrent_builds` | Max concurrent image builds | `1` |
| `limits.max_overlay_size` | Max overlay filesystem size | `100GB` |
| `acme.email` | Email for ACME certificate registration | _(empty)_ |
Expand Down
14 changes: 8 additions & 6 deletions cmd/api/api/cp.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ type CpResult struct {
BytesWritten int64 `json:"bytes_written,omitempty"`
}

// cpSpanAttributes builds the attribute set attached to a "cp.session" span.
// It carries exactly two attributes: the instance ID and the copy direction.
func cpSpanAttributes(instanceID, direction string) []attribute.KeyValue {
	attrs := make([]attribute.KeyValue, 0, 2)
	attrs = append(attrs,
		attribute.String("instance_id", instanceID),
		attribute.String("direction", direction),
	)
	return attrs
}

// CpHandler handles file copy requests via WebSocket
func (s *ApiService) CpHandler(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
Expand Down Expand Up @@ -146,12 +153,7 @@ func (s *ApiService) CpHandler(w http.ResponseWriter, r *http.Request) {
// Start OTEL span for tracing (WebSocket bypasses otelchi middleware)
tracer := otel.Tracer("hypeman/cp")
ctx, span := tracer.Start(ctx, "cp.session",
trace.WithAttributes(
attribute.String("instance_id", inst.Id),
attribute.String("direction", cpReq.Direction),
attribute.String("guest_path", cpReq.GuestPath),
attribute.String("subject", subject),
),
trace.WithAttributes(cpSpanAttributes(inst.Id, cpReq.Direction)...),
)
defer span.End()

Expand Down
22 changes: 22 additions & 0 deletions cmd/api/api/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ import (
"github.com/kernel/hypeman/lib/instances"
"github.com/kernel/hypeman/lib/logger"
mw "github.com/kernel/hypeman/lib/middleware"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
)

var upgrader = websocket.Upgrader{
Expand Down Expand Up @@ -46,6 +50,13 @@ type ResizeMessage struct {
} `json:"resize"`
}

// execSpanAttributes builds the attribute set attached to an "exec.session"
// span. It carries exactly two attributes: the instance ID and whether a TTY
// was requested.
func execSpanAttributes(instanceID string, tty bool) []attribute.KeyValue {
	attrs := make([]attribute.KeyValue, 0, 2)
	attrs = append(attrs,
		attribute.String("instance_id", instanceID),
		attribute.Bool("tty", tty),
	)
	return attrs
}

// ExecHandler handles exec requests via WebSocket for bidirectional streaming
// Note: Resolution is handled by ResolveResource middleware
func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {
Expand Down Expand Up @@ -108,6 +119,11 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {
}
}

// Start OTEL span for tracing (WebSocket bypasses otelchi middleware).
tracer := otel.Tracer("hypeman/exec")
ctx, span := tracer.Start(ctx, "exec.session", trace.WithAttributes(execSpanAttributes(inst.Id, execReq.TTY)...))
defer span.End()

// Audit log: exec session started
log.InfoContext(ctx, "exec session started",
"instance_id", inst.Id,
Expand All @@ -133,6 +149,8 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {

dialer, err := s.InstanceManager.GetVsockDialer(ctx, inst.Id)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.ErrorContext(ctx, "failed to get vsock dialer", "error", err)
ws.WriteMessage(websocket.BinaryMessage, []byte(fmt.Sprintf("Error: %v\r\n", err)))
ws.WriteMessage(websocket.TextMessage, []byte(`{"exitCode":127}`))
Expand All @@ -158,6 +176,8 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {
duration := time.Since(startTime)

if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
log.ErrorContext(ctx, "exec failed",
"error", err,
"instance_id", inst.Id,
Expand All @@ -180,6 +200,8 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {
"exit_code", exit.Code,
"duration_ms", duration.Milliseconds(),
)
span.SetAttributes(attribute.Int("exit_code", exit.Code))
span.SetStatus(codes.Ok, "")

// Send close frame with exit code in JSON
closeMsg := fmt.Sprintf(`{"exitCode":%d}`, exit.Code)
Expand Down
33 changes: 33 additions & 0 deletions cmd/api/api/trace_attrs_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package api

import (
"testing"

"github.com/stretchr/testify/require"
)

// TestCpSpanAttributes checks that cp span attributes include instance_id and
// direction, and do not include guest_path or subject.
func TestCpSpanAttributes(t *testing.T) {
	kvs := cpSpanAttributes("inst-123", "to")

	// Index the attributes by key so individual values can be asserted.
	byKey := make(map[string]any, len(kvs))
	for i := range kvs {
		byKey[string(kvs[i].Key)] = kvs[i].Value.AsInterface()
	}

	require.Equal(t, "inst-123", byKey["instance_id"])
	require.Equal(t, "to", byKey["direction"])
	require.NotContains(t, byKey, "guest_path")
	require.NotContains(t, byKey, "subject")
}

// TestExecSpanAttributes checks that exec span attributes include instance_id
// and tty, and do not include guest_path or subject.
func TestExecSpanAttributes(t *testing.T) {
	kvs := execSpanAttributes("inst-456", true)

	// Index the attributes by key so individual values can be asserted.
	byKey := make(map[string]any, len(kvs))
	for i := range kvs {
		byKey[string(kvs[i].Key)] = kvs[i].Value.AsInterface()
	}

	require.Equal(t, "inst-456", byKey["instance_id"])
	require.Equal(t, true, byKey["tty"])
	require.NotContains(t, byKey, "guest_path")
	require.NotContains(t, byKey, "subject")
}
51 changes: 41 additions & 10 deletions cmd/api/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"runtime"
"runtime/debug"
"strings"
"time"

"github.com/knadh/koanf/parsers/yaml"
"github.com/knadh/koanf/providers/env"
Expand Down Expand Up @@ -93,13 +94,21 @@ type APIConfig struct {
RedirectHTTP bool `koanf:"redirect_http"`
}

// MetricsConfig holds metrics endpoint settings.
// Defaults are set in defaultConfig and the values are checked in Validate.
type MetricsConfig struct {
// ListenAddress is the bind address for the /metrics endpoint
// (default "127.0.0.1"). Validate rejects an empty or whitespace-only value.
ListenAddress string `koanf:"listen_address"`
// Port is the port for the /metrics endpoint (default 9464).
// Validate requires it to be within 1-65535.
Port int `koanf:"port"`
// VMLabelBudget is the warning threshold for observed per-VM metric labels
// (default 200). Validate requires it to be positive.
VMLabelBudget int `koanf:"vm_label_budget"`
}

// OtelConfig holds OpenTelemetry settings.
type OtelConfig struct {
Enabled bool `koanf:"enabled"`
Endpoint string `koanf:"endpoint"`
ServiceName string `koanf:"service_name"`
ServiceInstanceID string `koanf:"service_instance_id"`
Insecure bool `koanf:"insecure"`
Enabled bool `koanf:"enabled"`
Endpoint string `koanf:"endpoint"`
ServiceName string `koanf:"service_name"`
ServiceInstanceID string `koanf:"service_instance_id"`
Insecure bool `koanf:"insecure"`
// MetricExportInterval is the OTLP metric push interval, kept as a duration
// string (default "60s"). When non-empty, Validate requires it to parse with
// time.ParseDuration.
MetricExportInterval string `koanf:"metric_export_interval"`
}

// LoggingConfig holds log rotation and level settings.
Expand Down Expand Up @@ -175,6 +184,7 @@ type Config struct {
Caddy CaddyConfig `koanf:"caddy"`
ACME ACMEConfig `koanf:"acme"`
API APIConfig `koanf:"api"`
Metrics MetricsConfig `koanf:"metrics"`
Otel OtelConfig `koanf:"otel"`
Logging LoggingConfig `koanf:"logging"`
Build BuildConfig `koanf:"build"`
Expand Down Expand Up @@ -245,12 +255,19 @@ func defaultConfig() *Config {
RedirectHTTP: true,
},

Metrics: MetricsConfig{
ListenAddress: "127.0.0.1",
Port: 9464,
VMLabelBudget: 200,
},

Otel: OtelConfig{
Enabled: false,
Endpoint: "127.0.0.1:4317",
ServiceName: "hypeman",
ServiceInstanceID: getHostname(),
Insecure: true,
Enabled: false,
Endpoint: "127.0.0.1:4317",
ServiceName: "hypeman",
ServiceInstanceID: getHostname(),
Insecure: true,
MetricExportInterval: "60s",
},

Logging: LoggingConfig{
Expand Down Expand Up @@ -373,6 +390,20 @@ func Load(configPath string) (*Config, error) {
// Validate checks configuration values for correctness.
// Returns an error if any configuration value is invalid.
func (c *Config) Validate() error {
if strings.TrimSpace(c.Metrics.ListenAddress) == "" {
return fmt.Errorf("metrics.listen_address must not be empty")
}
if c.Metrics.Port < 1 || c.Metrics.Port > 65535 {
return fmt.Errorf("metrics.port must be between 1 and 65535, got %d", c.Metrics.Port)
}
if c.Metrics.VMLabelBudget <= 0 {
return fmt.Errorf("metrics.vm_label_budget must be positive, got %d", c.Metrics.VMLabelBudget)
}
if c.Otel.MetricExportInterval != "" {
if _, err := time.ParseDuration(c.Otel.MetricExportInterval); err != nil {
return fmt.Errorf("otel.metric_export_interval must be a valid duration, got %q: %w", c.Otel.MetricExportInterval, err)
}
}
if c.Oversubscription.CPU <= 0 {
return fmt.Errorf("oversubscription.cpu must be positive, got %v", c.Oversubscription.CPU)
}
Expand Down
85 changes: 85 additions & 0 deletions cmd/api/config/config_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package config

import (
"os"
"path/filepath"
"testing"
)

// TestDefaultConfigIncludesMetricsSettings pins the built-in defaults for the
// metrics endpoint settings and the OTLP metric export interval.
func TestDefaultConfigIncludesMetricsSettings(t *testing.T) {
	defaults := defaultConfig()
	metrics, otel := defaults.Metrics, defaults.Otel

	if got := metrics.ListenAddress; got != "127.0.0.1" {
		t.Fatalf("expected default metrics.listen_address to be 127.0.0.1, got %q", got)
	}
	if got := metrics.Port; got != 9464 {
		t.Fatalf("expected default metrics.port to be 9464, got %d", got)
	}
	if got := metrics.VMLabelBudget; got != 200 {
		t.Fatalf("expected default metrics.vm_label_budget to be 200, got %d", got)
	}
	if got := otel.MetricExportInterval; got != "60s" {
		t.Fatalf("expected default otel.metric_export_interval to be 60s, got %q", got)
	}
}

// TestLoadEnvOverridesMetricsAndOtelInterval sets the metrics and OTEL interval
// environment variables, loads an empty YAML config file, and checks that the
// loaded configuration reflects the environment values.
func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
	overrides := [][2]string{
		{"METRICS__LISTEN_ADDRESS", "0.0.0.0"},
		{"METRICS__PORT", "9999"},
		{"METRICS__VM_LABEL_BUDGET", "350"},
		{"OTEL__METRIC_EXPORT_INTERVAL", "15s"},
	}
	for _, kv := range overrides {
		t.Setenv(kv[0], kv[1])
	}

	// An empty YAML document, so file contents contribute no settings.
	cfgPath := filepath.Join(t.TempDir(), "config.yaml")
	if err := os.WriteFile(cfgPath, []byte("{}\n"), 0600); err != nil {
		t.Fatalf("write temp config: %v", err)
	}

	loaded, err := Load(cfgPath)
	if err != nil {
		t.Fatalf("load config: %v", err)
	}

	if got := loaded.Metrics.ListenAddress; got != "0.0.0.0" {
		t.Fatalf("expected metrics.listen_address override, got %q", got)
	}
	if got := loaded.Metrics.Port; got != 9999 {
		t.Fatalf("expected metrics.port override, got %d", got)
	}
	if got := loaded.Metrics.VMLabelBudget; got != 350 {
		t.Fatalf("expected metrics.vm_label_budget override, got %d", got)
	}
	if got := loaded.Otel.MetricExportInterval; got != "15s" {
		t.Fatalf("expected otel.metric_export_interval override, got %q", got)
	}
}

// TestValidateRejectsInvalidMetricsPort checks that Validate rejects
// metrics.port values outside 1-65535. It covers both sides of the range check:
// zero, a negative value, and the first value above the maximum.
func TestValidateRejectsInvalidMetricsPort(t *testing.T) {
	cases := []struct {
		name string
		port int
	}{
		{name: "zero", port: 0},
		{name: "negative", port: -1},
		{name: "above_max", port: 65536},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Start from defaults so the port is the only changed field.
			cfg := defaultConfig()
			cfg.Metrics.Port = tc.port

			if err := cfg.Validate(); err == nil {
				t.Fatalf("expected validation error for invalid metrics port")
			}
		})
	}
}

// TestValidateRejectsInvalidMetricExportInterval checks that Validate returns
// an error when otel.metric_export_interval is not a parseable duration.
func TestValidateRejectsInvalidMetricExportInterval(t *testing.T) {
	invalid := defaultConfig()
	invalid.Otel.MetricExportInterval = "not-a-duration"

	if validationErr := invalid.Validate(); validationErr == nil {
		t.Fatalf("expected validation error for invalid metric export interval")
	}
}

// TestValidateRejectsInvalidVMLabelBudget checks that Validate rejects
// non-positive metrics.vm_label_budget values, covering both zero and a
// negative value.
func TestValidateRejectsInvalidVMLabelBudget(t *testing.T) {
	cases := []struct {
		name   string
		budget int
	}{
		{name: "zero", budget: 0},
		{name: "negative", budget: -1},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Start from defaults so the budget is the only changed field.
			cfg := defaultConfig()
			cfg.Metrics.VMLabelBudget = tc.budget

			if err := cfg.Validate(); err == nil {
				t.Fatalf("expected validation error for invalid vm label budget")
			}
		})
	}
}
Loading