Skip to content
Merged
16 changes: 16 additions & 0 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -1121,6 +1121,22 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance {
oapiInst.Gpu = gpu
}

// Expose phase accounting (cumulative time in each lifecycle phase). The
// snapshot rolls in time accrued in the current phase since the last
// transition, so consumers don't need a separate "live since" calculation.
if inst.Phases.Current != "" {
current := string(inst.Phases.Current)
oapiInst.CurrentPhase = &current
since := inst.Phases.Since
oapiInst.CurrentPhaseSince = &since
snapshot := inst.Phases.Snapshot(time.Now())
out := make(map[string]int64, len(snapshot))
for k, v := range snapshot {
out[string(k)] = v
}
oapiInst.PhaseDurationsMs = &out
}

return oapiInst
}

Expand Down
59 changes: 59 additions & 0 deletions cmd/api/api/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/kernel/hypeman/lib/autostandby"
"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/instances"
"github.com/kernel/hypeman/lib/instances/phasetracking"
mw "github.com/kernel/hypeman/lib/middleware"
"github.com/kernel/hypeman/lib/oapi"
"github.com/kernel/hypeman/lib/paths"
Expand Down Expand Up @@ -531,6 +532,64 @@ func TestCreateInstance_InvalidStandbyCompressionDelayInSnapshotPolicy(t *testin
assert.Contains(t, badReq.Message, "standby_compression_delay")
}

func TestInstanceToOAPI_EmitsPhaseAccounting(t *testing.T) {
t.Parallel()

t0 := time.Now().Add(-10 * time.Minute)
tr := phasetracking.Tracker{}
tr.Record(phasetracking.PhaseRunning, t0)
tr.Record(phasetracking.PhaseStandby, t0.Add(60*time.Second))
tr.Record(phasetracking.PhaseRunning, t0.Add(60*time.Second+5*time.Minute))

inst := instances.Instance{
StoredMetadata: instances.StoredMetadata{
Id: "inst-phases",
Name: "inst-phases",
Image: "docker.io/library/alpine:latest",
CreatedAt: t0,
HypervisorType: hypervisor.TypeCloudHypervisor,
Phases: tr,
},
State: instances.StateRunning,
}

oapiInst := instanceToOAPI(inst)

require.NotNil(t, oapiInst.CurrentPhase)
assert.Equal(t, "running", *oapiInst.CurrentPhase)
require.NotNil(t, oapiInst.CurrentPhaseSince)
require.NotNil(t, oapiInst.PhaseDurationsMs)

durations := *oapiInst.PhaseDurationsMs
// Standby stint was a completed 300s window — no live accrual since.
assert.Equal(t, int64(300_000), durations["standby"])
// Running = 60s completed + live time since latest Record. The
// recorded-at instant is in the past, so this must be >= 60s.
assert.GreaterOrEqual(t, durations["running"], int64(60_000),
"running should include the completed 60s stint")
}

func TestInstanceToOAPI_OmitsPhaseFieldsWhenUnset(t *testing.T) {
t.Parallel()

inst := instances.Instance{
StoredMetadata: instances.StoredMetadata{
Id: "inst-no-phases",
Name: "inst-no-phases",
Image: "docker.io/library/alpine:latest",
CreatedAt: time.Now(),
HypervisorType: hypervisor.TypeCloudHypervisor,
},
State: instances.StateStopped,
}

oapiInst := instanceToOAPI(inst)

assert.Nil(t, oapiInst.CurrentPhase)
assert.Nil(t, oapiInst.CurrentPhaseSince)
assert.Nil(t, oapiInst.PhaseDurationsMs)
}

func TestInstanceToOAPI_EmitsStandbyCompressionDelayInSnapshotPolicy(t *testing.T) {
t.Parallel()

Expand Down
8 changes: 7 additions & 1 deletion lib/instances/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/kernel/hypeman/lib/guestmemory"
"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/images"
"github.com/kernel/hypeman/lib/instances/phasetracking"
"github.com/kernel/hypeman/lib/logger"
"github.com/kernel/hypeman/lib/network"
"github.com/kernel/hypeman/lib/system"
Expand Down Expand Up @@ -496,6 +497,7 @@ func (m *manager) createInstance(
}
bootStart := time.Now().UTC()
stored.StartedAt = &bootStart
stored.Phases.Record(phasetracking.PhaseCreated, bootStart)

// 18. Save metadata
log.DebugContext(ctx, "saving instance metadata", "instance_id", id)
Expand Down Expand Up @@ -528,7 +530,11 @@ func (m *manager) createInstance(
reservedResources = false
}

// 20. Persist runtime metadata updates after VM boot.
// 20. Persist runtime metadata updates after VM boot. The VMM is up but
// guest boot markers have not yet been written, so we are in Initializing;
// persistBootMarkers will advance us to Running once the markers appear
// in the serial log.
stored.Phases.Record(phasetracking.PhaseInitializing, time.Now().UTC())
meta = &metadata{StoredMetadata: *stored}
if err := m.saveMetadata(meta); err != nil {
// VM is running but metadata failed - log but don't fail
Expand Down
7 changes: 7 additions & 0 deletions lib/instances/firecracker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/kernel/hypeman/lib/devices"
"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/images"
"github.com/kernel/hypeman/lib/instances/phasetracking"
"github.com/kernel/hypeman/lib/network"
"github.com/kernel/hypeman/lib/paths"
"github.com/kernel/hypeman/lib/resources"
Expand Down Expand Up @@ -163,6 +164,7 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) {
inst, err = waitForInstanceState(ctx, mgr, inst.Id, StateRunning, integrationTestTimeout(20*time.Second))
require.NoError(t, err)
require.NoError(t, waitForExecAgent(ctx, mgr, inst.Id, 30*time.Second))
assert.Equal(t, phasetracking.PhaseRunning, inst.Phases.Current, "fresh instance should be in running phase")

firstFilePath := "/tmp/firecracker-standby-first.txt"
secondFilePath := "/tmp/firecracker-standby-second.txt"
Expand Down Expand Up @@ -222,10 +224,14 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) {
t.Logf("first standby (full snapshot expected) took %v", firstStandbyDuration)
assert.Equal(t, StateStandby, inst.State)
assert.True(t, inst.HasSnapshot)
assert.Equal(t, phasetracking.PhaseStandby, inst.Phases.Current, "standby transition should set current phase")
assert.Greater(t, inst.Phases.Cumulative[phasetracking.PhaseRunning], int64(0), "first running stint should be accrued after standby")

firstRestoreRunningDuration, _ := restoreAndMeasure("first")
assert.False(t, inst.HasSnapshot, "running instances should not expose retained snapshot bases as standby snapshots")
assertRetainedBaseState()
assert.Equal(t, phasetracking.PhaseRunning, inst.Phases.Current, "restored instance should be in running phase")
assert.Greater(t, inst.Phases.Cumulative[phasetracking.PhaseStandby], int64(0), "first standby stint should be accrued after restore")
t.Logf("first full-cycle timings: standby=%v restore-to-running=%v", firstStandbyDuration, firstRestoreRunningDuration)

assertGuestFileContents(firstFilePath, firstFileContents)
Expand All @@ -245,6 +251,7 @@ func TestFirecrackerStandbyAndRestore(t *testing.T) {
secondRestoreRunningDuration, _ := restoreAndMeasure("second")
assert.False(t, inst.HasSnapshot, "running instances should not expose retained snapshot bases as standby snapshots")
assertRetainedBaseState()
assert.Equal(t, phasetracking.PhaseRunning, inst.Phases.Current, "second restore should land back in running")
t.Logf("second diff-cycle timings: standby=%v restore-to-running=%v", secondStandbyDuration, secondRestoreRunningDuration)

assertGuestFileContents(secondFilePath, secondFileContents)
Expand Down
15 changes: 14 additions & 1 deletion lib/instances/fork.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/kernel/hypeman/lib/forkvm"
"github.com/kernel/hypeman/lib/guest"
"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/instances/phasetracking"
"github.com/kernel/hypeman/lib/logger"
"github.com/kernel/hypeman/lib/network"
"github.com/nrednav/cuid2"
Expand Down Expand Up @@ -267,7 +268,7 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
return nil, fmt.Errorf("get vm starter: %w", err)
}

now := time.Now()
now := time.Now().UTC()
forkMeta := cloneStoredMetadataWithoutPendingStandbyCompression(meta.StoredMetadata)
forkMeta.Id = forkID
forkMeta.Name = req.Name
Expand All @@ -280,6 +281,17 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
forkMeta.VsockSocket = m.paths.InstanceSocket(forkID, hypervisor.VsockSocketNameForType(forkMeta.HypervisorType))
forkMeta.ExitCode = nil
forkMeta.ExitMessage = ""
// Forks are new instances; phase accounting must not inherit the source's
Comment thread
cursor[bot] marked this conversation as resolved.
// cumulative durations. The first transition into the fork's runtime
// phase (Standby for snapshot forks, Stopped for stopped forks) will be
// recorded by the appropriate operation when the fork is acted on.
forkMeta.Phases.Reset()
switch source.State {
case StateStandby:
forkMeta.Phases.Record(phasetracking.PhaseStandby, now)
case StateStopped:
forkMeta.Phases.Record(phasetracking.PhaseStopped, now)
}
Comment thread
cursor[bot] marked this conversation as resolved.

// Keep the original CID for snapshot-based forks.
// Rewriting CID in restored memory snapshots is not reliable across
Expand Down Expand Up @@ -514,6 +526,7 @@ func cloneStoredMetadata(src StoredMetadata) StoredMetadata {
exitCode := *src.ExitCode
dst.ExitCode = &exitCode
}
dst.Phases = src.Phases.Clone()

return dst
}
Expand Down
9 changes: 9 additions & 0 deletions lib/instances/fork_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/kernel/hypeman/lib/guest"
"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/images"
"github.com/kernel/hypeman/lib/instances/phasetracking"
"github.com/kernel/hypeman/lib/paths"
snapshotstore "github.com/kernel/hypeman/lib/snapshot"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -548,6 +549,14 @@ func TestForkCloudHypervisorFromRunningNetwork(t *testing.T) {
assert.NotEqual(t, sourceAfterFork.MAC, forked.MAC)
assertGuestHasOnlyExpectedIPv4(t, forked, forked.IP, 30*time.Second)
assertHostCanReachNginx(t, forked.IP, 80, 60*time.Second)

// Fork must start with a fresh phase ledger and not inherit the source's
// accumulated running time. The source has completed at least one running
// stint by now (running -> internal-standby -> running); the fork is still
// in its first running stint and so has no completed running ms logged.
assert.Equal(t, phasetracking.PhaseRunning, forked.Phases.Current)
assert.Zero(t, forked.Phases.Cumulative[phasetracking.PhaseRunning], "fork should not inherit source's running ledger")
assert.Greater(t, sourceAfterFork.Phases.Cumulative[phasetracking.PhaseRunning], int64(0), "source's pre-fork running stint should be cumulated")
}

func assertHostCanReachNginx(t *testing.T, ip string, port int, timeout time.Duration) {
Expand Down
114 changes: 114 additions & 0 deletions lib/instances/phasetracking/phasetracking.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Package phasetracking accumulates cumulative time-in-phase for instance
// lifecycle phases. The tracker is embedded in instance metadata and updated
// at every externally-observable state transition so consumers can use the
// resulting durations for billing, observability, and analytics.
//
// Phases mirror the externally-observable values of instances.State (lowercased
// so they remain stable in the API surface even if the internal enum is
// renamed). The Initializing→Running transition is detected lazily when guest
// boot markers are persisted, so the tracker reflects the same view of guest
// readiness that the public State machine reports — not the bare moment the
// VMM process came up.
//
// Transient internal substates that no external observer can see — for example
// the Paused/Shutdown steps inside a single Standby or Stop orchestration — are
// intentionally not recorded; they would be sub-millisecond blips inside a
// non-yielding function call that adds noise without truth.
//
// Only the transition orchestration sites in lib/instances should call Record.
// The tracker intentionally does not subscribe to the lifecycle event bus —
// that bus is best-effort and lossy, which is unsuitable for billing.
package phasetracking

import "time"

// Phase is the canonical lifecycle phase name. Values mirror instances.State
Comment thread
sjmiller609 marked this conversation as resolved.
// lowercased so they remain stable in the API surface even if the internal
// State enum is renamed.
type Phase string

const (
PhaseStopped Phase = "stopped"
PhaseCreated Phase = "created"
PhaseInitializing Phase = "initializing"
PhaseRunning Phase = "running"
PhaseStandby Phase = "standby"
)

// Tracker accumulates cumulative wall-clock time spent in each phase.
//
// Invariants:
// - Cumulative[phase] is the total ms spent in `phase` across all prior
// completed visits to that phase.
// - Time spent in the *current* phase (since `Since`) is NOT yet rolled into
// Cumulative — callers that want "live" totals should use Snapshot.
// - Current and Since must be updated atomically with Cumulative; that's
// the contract of Record. Direct mutation is not supported.
//
// The zero value is valid: it represents an instance that has not entered
// any phase yet. The first Record call sets Current and Since without
// accruing time (there is no prior phase to accrue from).
type Tracker struct {
Current Phase `json:"current,omitempty"`
Since time.Time `json:"since,omitempty"`
Cumulative map[Phase]int64 `json:"cumulative,omitempty"`
}

// Record transitions into newPhase as of `now`, first accruing time-in-current
// into Cumulative. Safe to call on a zero-value Tracker (first transition has
// no prior phase, so no accrual happens).
//
// `now` is a parameter rather than time.Now() so tests can pin time and so
// callers can use the same `now` value they're persisting elsewhere on the
// metadata (e.g. StartedAt) without drift.
func (t *Tracker) Record(newPhase Phase, now time.Time) {
if t.Cumulative == nil {
t.Cumulative = make(map[Phase]int64)
}
if t.Current != "" && !t.Since.IsZero() {
elapsed := now.Sub(t.Since).Milliseconds()
if elapsed > 0 {
t.Cumulative[t.Current] += elapsed
}
}
t.Current = newPhase
t.Since = now
}

// Snapshot returns a copy of Cumulative with the in-flight time-in-current
// rolled in, without mutating the tracker. Use this when reporting "running
// time so far" — typically in the API response path.
func (t Tracker) Snapshot(now time.Time) map[Phase]int64 {
out := make(map[Phase]int64, len(t.Cumulative)+1)
for k, v := range t.Cumulative {
out[k] = v
}
if t.Current != "" && !t.Since.IsZero() {
elapsed := now.Sub(t.Since).Milliseconds()
if elapsed > 0 {
out[t.Current] += elapsed
}
}
return out
}

// Reset clears all accumulated state. Used when forking — the fork is a new
// instance and must not inherit the source's phase history.
func (t *Tracker) Reset() {
t.Current = ""
t.Since = time.Time{}
t.Cumulative = nil
}

// Clone returns a deep copy of the tracker. The returned tracker shares no
// state with the receiver, so independent Record/Reset calls do not interfere.
func (t Tracker) Clone() Tracker {
dst := t
if t.Cumulative != nil {
dst.Cumulative = make(map[Phase]int64, len(t.Cumulative))
for k, v := range t.Cumulative {
dst.Cumulative[k] = v
}
}
return dst
}
Loading
Loading