From cae6ad140d541f8516533f905b767f6747defb9d Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Wed, 25 Mar 2026 13:40:47 -0400 Subject: [PATCH] Add running latency metric and compression labels --- lib/instances/metrics.go | 83 +++++++++++++++++++++++++ lib/instances/metrics_test.go | 114 ++++++++++++++++++++++++++++++++++ lib/instances/query.go | 3 + lib/instances/restore.go | 27 +++++++- lib/instances/standby.go | 2 +- lib/otel/README.md | 5 +- 6 files changed, 230 insertions(+), 4 deletions(-) diff --git a/lib/instances/metrics.go b/lib/instances/metrics.go index e88f439e..453e45ed 100644 --- a/lib/instances/metrics.go +++ b/lib/instances/metrics.go @@ -2,6 +2,7 @@ package instances import ( "context" + "strconv" "time" "github.com/kernel/hypeman/lib/hypervisor" @@ -66,6 +67,7 @@ type Metrics struct { standbyDuration metric.Float64Histogram stopDuration metric.Float64Histogram startDuration metric.Float64Histogram + timeToRunning metric.Float64Histogram stateTransitions metric.Int64Counter snapshotCompressionJobsTotal metric.Int64Counter snapshotCompressionDuration metric.Float64Histogram @@ -125,6 +127,15 @@ func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*M return nil, err } + timeToRunning, err := meter.Float64Histogram( + "hypeman_instances_time_to_running_seconds", + metric.WithDescription("Time from boot start until an instance reaches Running"), + metric.WithUnit("s"), + ) + if err != nil { + return nil, err + } + stateTransitions, err := meter.Int64Counter( "hypeman_instances_state_transitions_total", metric.WithDescription("Total number of instance state transitions"), @@ -318,6 +329,7 @@ func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*M standbyDuration: standbyDuration, stopDuration: stopDuration, startDuration: startDuration, + timeToRunning: timeToRunning, stateTransitions: stateTransitions, snapshotCompressionJobsTotal: snapshotCompressionJobsTotal, snapshotCompressionDuration: snapshotCompressionDuration, @@ -355,6 +367,77 @@ func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Hi histogram.Record(ctx, duration, metric.WithAttributes(attrs...)) } +func compressionMetricAttributes(cfg *snapshotstore.SnapshotCompressionConfig) []attribute.KeyValue { + algorithm := "none" + level := "none" + if cfg != nil && cfg.Enabled { + if cfg.Algorithm != "" { + algorithm = string(cfg.Algorithm) + } else { + algorithm = "unknown" + } + if cfg.Level != nil { + level = strconv.Itoa(*cfg.Level) + } else { + level = "unknown" + } + } + return []attribute.KeyValue{ + attribute.String("algorithm", algorithm), + attribute.String("level", level), + } +} + +func (m *manager) recordDurationWithCompression(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string, hvType hypervisor.Type, compression *snapshotstore.SnapshotCompressionConfig) { + if m.metrics == nil { + return + } + duration := time.Since(start).Seconds() + attrs := []attribute.KeyValue{ + attribute.String("status", status), + } + if hvType != "" { + attrs = append(attrs, attribute.String("hypervisor", string(hvType))) + } + attrs = append(attrs, compressionMetricAttributes(compression)...) + histogram.Record(ctx, duration, metric.WithAttributes(attrs...)) +} + +func timeToRunningReadyAt(stored *StoredMetadata) *time.Time { + if stored == nil || stored.ProgramStartedAt == nil { + return nil + } + if stored.SkipGuestAgent || stored.GuestAgentReadyAt == nil { + return stored.ProgramStartedAt + } + if stored.GuestAgentReadyAt.After(*stored.ProgramStartedAt) { + return stored.GuestAgentReadyAt + } + return stored.ProgramStartedAt +} + +func (m *manager) recordTimeToRunning(ctx context.Context, stored *StoredMetadata) { + if m.metrics == nil || stored == nil || stored.StartedAt == nil { + return + } + + readyAt := timeToRunningReadyAt(stored) + if readyAt == nil { + return + } + + duration := readyAt.UTC().Sub(stored.StartedAt.UTC()).Seconds() + if duration < 0 { + duration = 0 + } + + attrs := []attribute.KeyValue{} + if stored.HypervisorType != "" { + attrs = append(attrs, attribute.String("hypervisor", string(stored.HypervisorType))) + } + m.metrics.timeToRunning.Record(ctx, duration, metric.WithAttributes(attrs...)) +} + // recordStateTransition records a state transition with hypervisor label. func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string, hvType hypervisor.Type) { if m.metrics == nil { diff --git a/lib/instances/metrics_test.go b/lib/instances/metrics_test.go index c6d1a3fe..88def215 100644 --- a/lib/instances/metrics_test.go +++ b/lib/instances/metrics_test.go @@ -224,6 +224,120 @@ func TestInstanceOldestInStateMetric_ObserveOldestAgePerState(t *testing.T) { } } +func TestInstanceTimeToRunningMetric_RecordWhenBootMarkersPersisted(t *testing.T) { + t.Parallel() + + reader := otelmetric.NewManualReader() + provider := otelmetric.NewMeterProvider(otelmetric.WithReader(reader)) + + tmpDir := t.TempDir() + m := &manager{ + paths: paths.New(tmpDir), + } + + metrics, err := newInstanceMetrics(provider.Meter("test"), nil, m) + require.NoError(t, err) + m.metrics = metrics + + id := "time-to-running-instance" + require.NoError(t, m.ensureDirectories(id)) + + bootStart := time.Date(2026, time.March, 25, 12, 0, 0, 0, time.UTC) + programStartedAt := bootStart.Add(2 * time.Second) + guestAgentReadyAt := bootStart.Add(3500 * time.Millisecond) + + require.NoError(t, m.saveMetadata(&metadata{StoredMetadata: StoredMetadata{ + Id: id, + Name: id, + DataDir: m.paths.InstanceDir(id), + SocketPath: m.paths.InstanceSocket(id, "cloud-hypervisor.sock"), + StartedAt: &bootStart, + HypervisorType: hypervisor.TypeCloudHypervisor, + }})) + + logPath := m.paths.InstanceAppLog(id) + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + require.NoError(t, os.WriteFile(logPath, []byte( + "HYPEMAN-PROGRAM-START ts="+programStartedAt.Format(time.RFC3339Nano)+" mode=exec\n"+ + "HYPEMAN-AGENT-READY ts="+guestAgentReadyAt.Format(time.RFC3339Nano)+"\n", + ), 0o644)) + require.NoError(t, os.Chtimes(logPath, bootStart.Add(time.Second), bootStart.Add(time.Second))) + + m.persistBootMarkers(t.Context(), id) + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(t.Context(), &rm)) + + assertMetricNames(t, rm, []string{ + "hypeman_instances_time_to_running_seconds", + }) + + timeToRunningMetric := findMetric(t, rm, "hypeman_instances_time_to_running_seconds") + timeToRunning, ok := timeToRunningMetric.Data.(metricdata.Histogram[float64]) + require.True(t, ok) + require.Len(t, timeToRunning.DataPoints, 1) + assert.Equal(t, uint64(1), timeToRunning.DataPoints[0].Count) + assert.InDelta(t, 3.5, timeToRunning.DataPoints[0].Sum, 0.001) + assert.Equal(t, "cloud-hypervisor", metricLabel(t, timeToRunning.DataPoints[0].Attributes, "hypervisor")) +} + +func TestLifecycleDurationMetrics_RecordCompressionLabels(t *testing.T) { + t.Parallel() + + reader := otelmetric.NewManualReader() + provider := otelmetric.NewMeterProvider(otelmetric.WithReader(reader)) + + m := &manager{ + paths: paths.New(t.TempDir()), + } + metrics, err := newInstanceMetrics(provider.Meter("test"), nil, m) + require.NoError(t, err) + m.metrics = metrics + + level := 3 + m.recordDurationWithCompression( + t.Context(), + m.metrics.restoreDuration, + time.Now().Add(-150*time.Millisecond), + "success", + hypervisor.TypeCloudHypervisor, + &snapshotstore.SnapshotCompressionConfig{ + Enabled: true, + Algorithm: snapshotstore.SnapshotCompressionAlgorithmZstd, + Level: &level, + }, + ) + m.recordDurationWithCompression( + t.Context(), + m.metrics.standbyDuration, + time.Now().Add(-100*time.Millisecond), + "success", + hypervisor.TypeQEMU, + nil, + ) + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(t.Context(), &rm)) + + restoreMetric := findMetric(t, rm, "hypeman_instances_restore_duration_seconds") + restore, ok := restoreMetric.Data.(metricdata.Histogram[float64]) + require.True(t, ok) + require.Len(t, restore.DataPoints, 1) + assert.Equal(t, "success", metricLabel(t, restore.DataPoints[0].Attributes, "status")) + assert.Equal(t, "cloud-hypervisor", metricLabel(t, restore.DataPoints[0].Attributes, "hypervisor")) + assert.Equal(t, "zstd", metricLabel(t, restore.DataPoints[0].Attributes, "algorithm")) + assert.Equal(t, "3", metricLabel(t, restore.DataPoints[0].Attributes, "level")) + + standbyMetric := findMetric(t, rm, "hypeman_instances_standby_duration_seconds") + standby, ok := standbyMetric.Data.(metricdata.Histogram[float64]) + require.True(t, ok) + require.Len(t, standby.DataPoints, 1) + assert.Equal(t, "success", metricLabel(t, standby.DataPoints[0].Attributes, "status")) + assert.Equal(t, "qemu", metricLabel(t, standby.DataPoints[0].Attributes, "hypervisor")) + assert.Equal(t, "none", metricLabel(t, standby.DataPoints[0].Attributes, "algorithm")) + assert.Equal(t, "none", metricLabel(t, standby.DataPoints[0].Attributes, "level")) +} + func assertMetricNames(t *testing.T, rm metricdata.ResourceMetrics, expected []string) { t.Helper() diff --git a/lib/instances/query.go b/lib/instances/query.go index 96d85f5f..0a38771c 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -497,6 +497,9 @@ func (m *manager) persistBootMarkers(ctx context.Context, id string) { if err := m.saveMetadata(meta); err != nil { log.WarnContext(ctx, "failed to persist boot markers", "instance_id", id, "error", err) } else { + if deriveRunningState(&meta.StoredMetadata) == StateRunning { + m.recordTimeToRunning(ctx, &meta.StoredMetadata) + } log.DebugContext(ctx, "persisted boot markers from serial log", "instance_id", id) } } diff --git a/lib/instances/restore.go b/lib/instances/restore.go index 09110a11..68bcb7f1 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -14,6 +14,7 @@ import ( "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/network" + snapshotstore "github.com/kernel/hypeman/lib/snapshot" "go.opentelemetry.io/otel/attribute" ) @@ -69,6 +70,7 @@ func (m *manager) restoreInstance( // 3. Get snapshot directory snapshotDir := m.paths.InstanceSnapshotLatest(id) + restoreCompression := m.restoreCompressionConfigForMetrics(stored, snapshotDir) prepareSnapshotCtx, prepareSnapshotSpanEnd := m.startLifecycleStep(ctx, "prepare_snapshot_memory", attribute.String("instance_id", id), attribute.String("hypervisor", string(stored.HypervisorType)), @@ -296,13 +298,36 @@ func (m *manager) restoreInstance( finalInst := m.toInstanceWithoutHydration(ctx, meta) // Record metrics if m.metrics != nil { - m.recordDuration(ctx, m.metrics.restoreDuration, start, "success", stored.HypervisorType) + m.recordDurationWithCompression(ctx, m.metrics.restoreDuration, start, "success", stored.HypervisorType, restoreCompression) m.recordStateTransition(ctx, string(StateStandby), string(finalInst.State), stored.HypervisorType) } log.InfoContext(ctx, "instance restored successfully", "instance_id", id, "state", finalInst.State) return &finalInst, nil } +func (m *manager) restoreCompressionConfigForMetrics(stored *StoredMetadata, snapshotDir string) *snapshotstore.SnapshotCompressionConfig { + _, algorithm, ok := findCompressedSnapshotMemoryFile(snapshotDir) + if !ok { + return nil + } + + cfg := snapshotstore.SnapshotCompressionConfig{ + Enabled: true, + Algorithm: algorithm, + } + if stored == nil { + return &cfg + } + + policy, err := m.resolveSnapshotCompressionPolicy(stored, nil) + if err != nil || !policy.Enabled { + return &cfg + } + + resolved := compressionMetadataForExistingArtifact(policy, algorithm) + return &resolved +} + // restoreFromSnapshot starts the hypervisor and restores from snapshot func (m *manager) restoreFromSnapshot( ctx context.Context, diff --git a/lib/instances/standby.go b/lib/instances/standby.go index 193ed16a..4974f91e 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -206,7 +206,7 @@ func (m *manager) standbyInstance( // Record metrics if m.metrics != nil { - m.recordDuration(ctx, m.metrics.standbyDuration, start, "success", stored.HypervisorType) + m.recordDurationWithCompression(ctx, m.metrics.standbyDuration, start, "success", stored.HypervisorType, compressionPolicy) m.recordStateTransition(ctx, string(StateRunning), string(StateStandby), stored.HypervisorType) } diff --git a/lib/otel/README.md b/lib/otel/README.md index 05c8af6b..47cd9123 100644 --- a/lib/otel/README.md +++ b/lib/otel/README.md @@ -61,8 +61,9 @@ This keeps pull and push views aligned because both are sourced from the same OT |--------|------|--------|-------------| | `hypeman_instances_total` | gauge | state | Instances by state | | `hypeman_instances_create_duration_seconds` | histogram | status | Create time | -| `hypeman_instances_restore_duration_seconds` | histogram | status | Restore time | -| `hypeman_instances_standby_duration_seconds` | histogram | status | Standby time | +| `hypeman_instances_time_to_running_seconds` | histogram | hypervisor | Time from boot start until an instance reaches Running | +| `hypeman_instances_restore_duration_seconds` | histogram | status, hypervisor, algorithm, level | Restore time | +| `hypeman_instances_standby_duration_seconds` | histogram | status, hypervisor, algorithm, level | Standby time | | `hypeman_instances_state_transitions_total` | counter | from, to | State transitions | ### Network