Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions lib/instances/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package instances

import (
"context"
"strconv"
"time"

"github.com/kernel/hypeman/lib/hypervisor"
Expand Down Expand Up @@ -66,6 +67,7 @@ type Metrics struct {
standbyDuration metric.Float64Histogram
stopDuration metric.Float64Histogram
startDuration metric.Float64Histogram
timeToRunning metric.Float64Histogram
stateTransitions metric.Int64Counter
snapshotCompressionJobsTotal metric.Int64Counter
snapshotCompressionDuration metric.Float64Histogram
Expand Down Expand Up @@ -125,6 +127,15 @@ func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*M
return nil, err
}

timeToRunning, err := meter.Float64Histogram(
"hypeman_instances_time_to_running_seconds",
metric.WithDescription("Time from boot start until an instance reaches Running"),
metric.WithUnit("s"),
)
if err != nil {
return nil, err
}

stateTransitions, err := meter.Int64Counter(
"hypeman_instances_state_transitions_total",
metric.WithDescription("Total number of instance state transitions"),
Expand Down Expand Up @@ -318,6 +329,7 @@ func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*M
standbyDuration: standbyDuration,
stopDuration: stopDuration,
startDuration: startDuration,
timeToRunning: timeToRunning,
stateTransitions: stateTransitions,
snapshotCompressionJobsTotal: snapshotCompressionJobsTotal,
snapshotCompressionDuration: snapshotCompressionDuration,
Expand Down Expand Up @@ -355,6 +367,77 @@ func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Hi
histogram.Record(ctx, duration, metric.WithAttributes(attrs...))
}

func compressionMetricAttributes(cfg *snapshotstore.SnapshotCompressionConfig) []attribute.KeyValue {
algorithm := "none"
level := "none"
if cfg != nil && cfg.Enabled {
if cfg.Algorithm != "" {
algorithm = string(cfg.Algorithm)
} else {
algorithm = "unknown"
}
if cfg.Level != nil {
level = strconv.Itoa(*cfg.Level)
} else {
level = "unknown"
}
}
return []attribute.KeyValue{
attribute.String("algorithm", algorithm),
attribute.String("level", level),
}
}

func (m *manager) recordDurationWithCompression(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string, hvType hypervisor.Type, compression *snapshotstore.SnapshotCompressionConfig) {
if m.metrics == nil {
return
}
duration := time.Since(start).Seconds()
attrs := []attribute.KeyValue{
attribute.String("status", status),
}
if hvType != "" {
attrs = append(attrs, attribute.String("hypervisor", string(hvType)))
}
attrs = append(attrs, compressionMetricAttributes(compression)...)
histogram.Record(ctx, duration, metric.WithAttributes(attrs...))
}

func timeToRunningReadyAt(stored *StoredMetadata) *time.Time {
if stored == nil || stored.ProgramStartedAt == nil {
return nil
}
if stored.SkipGuestAgent || stored.GuestAgentReadyAt == nil {
return stored.ProgramStartedAt
}
if stored.GuestAgentReadyAt.After(*stored.ProgramStartedAt) {
return stored.GuestAgentReadyAt
}
return stored.ProgramStartedAt
}

func (m *manager) recordTimeToRunning(ctx context.Context, stored *StoredMetadata) {
if m.metrics == nil || stored == nil || stored.StartedAt == nil {
return
}

readyAt := timeToRunningReadyAt(stored)
if readyAt == nil {
return
}

duration := readyAt.UTC().Sub(stored.StartedAt.UTC()).Seconds()
if duration < 0 {
duration = 0
}

attrs := []attribute.KeyValue{}
if stored.HypervisorType != "" {
attrs = append(attrs, attribute.String("hypervisor", string(stored.HypervisorType)))
}
m.metrics.timeToRunning.Record(ctx, duration, metric.WithAttributes(attrs...))
}

// recordStateTransition records a state transition with hypervisor label.
func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string, hvType hypervisor.Type) {
if m.metrics == nil {
Expand Down
114 changes: 114 additions & 0 deletions lib/instances/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,120 @@ func TestInstanceOldestInStateMetric_ObserveOldestAgePerState(t *testing.T) {
}
}

func TestInstanceTimeToRunningMetric_RecordWhenBootMarkersPersisted(t *testing.T) {
t.Parallel()

reader := otelmetric.NewManualReader()
provider := otelmetric.NewMeterProvider(otelmetric.WithReader(reader))

tmpDir := t.TempDir()
m := &manager{
paths: paths.New(tmpDir),
}

metrics, err := newInstanceMetrics(provider.Meter("test"), nil, m)
require.NoError(t, err)
m.metrics = metrics

id := "time-to-running-instance"
require.NoError(t, m.ensureDirectories(id))

bootStart := time.Date(2026, time.March, 25, 12, 0, 0, 0, time.UTC)
programStartedAt := bootStart.Add(2 * time.Second)
guestAgentReadyAt := bootStart.Add(3500 * time.Millisecond)

require.NoError(t, m.saveMetadata(&metadata{StoredMetadata: StoredMetadata{
Id: id,
Name: id,
DataDir: m.paths.InstanceDir(id),
SocketPath: m.paths.InstanceSocket(id, "cloud-hypervisor.sock"),
StartedAt: &bootStart,
HypervisorType: hypervisor.TypeCloudHypervisor,
}}))

logPath := m.paths.InstanceAppLog(id)
require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755))
require.NoError(t, os.WriteFile(logPath, []byte(
"HYPEMAN-PROGRAM-START ts="+programStartedAt.Format(time.RFC3339Nano)+" mode=exec\n"+
"HYPEMAN-AGENT-READY ts="+guestAgentReadyAt.Format(time.RFC3339Nano)+"\n",
), 0o644))
require.NoError(t, os.Chtimes(logPath, bootStart.Add(time.Second), bootStart.Add(time.Second)))

m.persistBootMarkers(t.Context(), id)

var rm metricdata.ResourceMetrics
require.NoError(t, reader.Collect(t.Context(), &rm))

assertMetricNames(t, rm, []string{
"hypeman_instances_time_to_running_seconds",
})

timeToRunningMetric := findMetric(t, rm, "hypeman_instances_time_to_running_seconds")
timeToRunning, ok := timeToRunningMetric.Data.(metricdata.Histogram[float64])
require.True(t, ok)
require.Len(t, timeToRunning.DataPoints, 1)
assert.Equal(t, uint64(1), timeToRunning.DataPoints[0].Count)
assert.InDelta(t, 3.5, timeToRunning.DataPoints[0].Sum, 0.001)
assert.Equal(t, "cloud-hypervisor", metricLabel(t, timeToRunning.DataPoints[0].Attributes, "hypervisor"))
}

func TestLifecycleDurationMetrics_RecordCompressionLabels(t *testing.T) {
t.Parallel()

reader := otelmetric.NewManualReader()
provider := otelmetric.NewMeterProvider(otelmetric.WithReader(reader))

m := &manager{
paths: paths.New(t.TempDir()),
}
metrics, err := newInstanceMetrics(provider.Meter("test"), nil, m)
require.NoError(t, err)
m.metrics = metrics

level := 3
m.recordDurationWithCompression(
t.Context(),
m.metrics.restoreDuration,
time.Now().Add(-150*time.Millisecond),
"success",
hypervisor.TypeCloudHypervisor,
&snapshotstore.SnapshotCompressionConfig{
Enabled: true,
Algorithm: snapshotstore.SnapshotCompressionAlgorithmZstd,
Level: &level,
},
)
m.recordDurationWithCompression(
t.Context(),
m.metrics.standbyDuration,
time.Now().Add(-100*time.Millisecond),
"success",
hypervisor.TypeQEMU,
nil,
)

var rm metricdata.ResourceMetrics
require.NoError(t, reader.Collect(t.Context(), &rm))

restoreMetric := findMetric(t, rm, "hypeman_instances_restore_duration_seconds")
restore, ok := restoreMetric.Data.(metricdata.Histogram[float64])
require.True(t, ok)
require.Len(t, restore.DataPoints, 1)
assert.Equal(t, "success", metricLabel(t, restore.DataPoints[0].Attributes, "status"))
assert.Equal(t, "cloud-hypervisor", metricLabel(t, restore.DataPoints[0].Attributes, "hypervisor"))
assert.Equal(t, "zstd", metricLabel(t, restore.DataPoints[0].Attributes, "algorithm"))
assert.Equal(t, "3", metricLabel(t, restore.DataPoints[0].Attributes, "level"))

standbyMetric := findMetric(t, rm, "hypeman_instances_standby_duration_seconds")
standby, ok := standbyMetric.Data.(metricdata.Histogram[float64])
require.True(t, ok)
require.Len(t, standby.DataPoints, 1)
assert.Equal(t, "success", metricLabel(t, standby.DataPoints[0].Attributes, "status"))
assert.Equal(t, "qemu", metricLabel(t, standby.DataPoints[0].Attributes, "hypervisor"))
assert.Equal(t, "none", metricLabel(t, standby.DataPoints[0].Attributes, "algorithm"))
assert.Equal(t, "none", metricLabel(t, standby.DataPoints[0].Attributes, "level"))
}

func assertMetricNames(t *testing.T, rm metricdata.ResourceMetrics, expected []string) {
t.Helper()

Expand Down
3 changes: 3 additions & 0 deletions lib/instances/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,9 @@ func (m *manager) persistBootMarkers(ctx context.Context, id string) {
if err := m.saveMetadata(meta); err != nil {
log.WarnContext(ctx, "failed to persist boot markers", "instance_id", id, "error", err)
} else {
if deriveRunningState(&meta.StoredMetadata) == StateRunning {
m.recordTimeToRunning(ctx, &meta.StoredMetadata)
}
log.DebugContext(ctx, "persisted boot markers from serial log", "instance_id", id)
}
}
Expand Down
27 changes: 26 additions & 1 deletion lib/instances/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/logger"
"github.com/kernel/hypeman/lib/network"
snapshotstore "github.com/kernel/hypeman/lib/snapshot"
"go.opentelemetry.io/otel/attribute"
)

Expand Down Expand Up @@ -69,6 +70,7 @@ func (m *manager) restoreInstance(

// 3. Get snapshot directory
snapshotDir := m.paths.InstanceSnapshotLatest(id)
restoreCompression := m.restoreCompressionConfigForMetrics(stored, snapshotDir)
prepareSnapshotCtx, prepareSnapshotSpanEnd := m.startLifecycleStep(ctx, "prepare_snapshot_memory",
attribute.String("instance_id", id),
attribute.String("hypervisor", string(stored.HypervisorType)),
Expand Down Expand Up @@ -296,13 +298,36 @@ func (m *manager) restoreInstance(
finalInst := m.toInstanceWithoutHydration(ctx, meta)
// Record metrics
if m.metrics != nil {
m.recordDuration(ctx, m.metrics.restoreDuration, start, "success", stored.HypervisorType)
m.recordDurationWithCompression(ctx, m.metrics.restoreDuration, start, "success", stored.HypervisorType, restoreCompression)
m.recordStateTransition(ctx, string(StateStandby), string(finalInst.State), stored.HypervisorType)
}
log.InfoContext(ctx, "instance restored successfully", "instance_id", id, "state", finalInst.State)
return &finalInst, nil
}

func (m *manager) restoreCompressionConfigForMetrics(stored *StoredMetadata, snapshotDir string) *snapshotstore.SnapshotCompressionConfig {
_, algorithm, ok := findCompressedSnapshotMemoryFile(snapshotDir)
if !ok {
return nil
}

cfg := snapshotstore.SnapshotCompressionConfig{
Enabled: true,
Algorithm: algorithm,
}
if stored == nil {
return &cfg
}

policy, err := m.resolveSnapshotCompressionPolicy(stored, nil)
if err != nil || !policy.Enabled {
return &cfg
}

resolved := compressionMetadataForExistingArtifact(policy, algorithm)
return &resolved
}

// restoreFromSnapshot starts the hypervisor and restores from snapshot
func (m *manager) restoreFromSnapshot(
ctx context.Context,
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/standby.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ func (m *manager) standbyInstance(

// Record metrics
if m.metrics != nil {
m.recordDuration(ctx, m.metrics.standbyDuration, start, "success", stored.HypervisorType)
m.recordDurationWithCompression(ctx, m.metrics.standbyDuration, start, "success", stored.HypervisorType, compressionPolicy)
m.recordStateTransition(ctx, string(StateRunning), string(StateStandby), stored.HypervisorType)
}

Expand Down
5 changes: 3 additions & 2 deletions lib/otel/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,9 @@ This keeps pull and push views aligned because both are sourced from the same OT
|--------|------|--------|-------------|
| `hypeman_instances_total` | gauge | state | Instances by state |
| `hypeman_instances_create_duration_seconds` | histogram | status | Create time |
| `hypeman_instances_restore_duration_seconds` | histogram | status | Restore time |
| `hypeman_instances_standby_duration_seconds` | histogram | status | Standby time |
| `hypeman_instances_time_to_running_seconds` | histogram | hypervisor | Time from boot start until an instance reaches Running |
| `hypeman_instances_restore_duration_seconds` | histogram | status, hypervisor, algorithm, level | Restore time |
| `hypeman_instances_standby_duration_seconds` | histogram | status, hypervisor, algorithm, level | Standby time |
| `hypeman_instances_state_transitions_total` | counter | from, to | State transitions |

### Network
Expand Down
Loading