Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -657,9 +657,10 @@ func (s *ApiService) ForkInstance(ctx context.Context, request oapi.ForkInstance
}

result, err := s.InstanceManager.ForkInstance(ctx, inst.Id, instances.ForkInstanceRequest{
Name: request.Body.Name,
FromRunning: request.Body.FromRunning != nil && *request.Body.FromRunning,
TargetState: targetState,
Name: request.Body.Name,
FromRunning: request.Body.FromRunning != nil && *request.Body.FromRunning,
TargetState: targetState,
WaitForNetwork: request.Body.WaitForNetwork,
})
if err != nil {
switch {
Expand Down
6 changes: 5 additions & 1 deletion cmd/api/api/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1165,13 +1165,15 @@ func TestForkInstance_Success(t *testing.T) {
result: forked,
}
svc.InstanceManager = mockMgr
waitForNetwork := false

resp, err := svc.ForkInstance(
mw.WithResolvedInstance(ctx(), source.Id, source),
oapi.ForkInstanceRequestObject{
Id: source.Id,
Body: &oapi.ForkInstanceRequest{
Name: "forked-instance",
Name: "forked-instance",
WaitForNetwork: &waitForNetwork,
},
},
)
Expand All @@ -1185,6 +1187,8 @@ func TestForkInstance_Success(t *testing.T) {
assert.Equal(t, "forked-instance", mockMgr.lastReq.Name)
assert.False(t, mockMgr.lastReq.FromRunning)
assert.Equal(t, instances.State(""), mockMgr.lastReq.TargetState)
require.NotNil(t, mockMgr.lastReq.WaitForNetwork)
assert.False(t, *mockMgr.lastReq.WaitForNetwork)
}

func TestForkInstance_NotSupported(t *testing.T) {
Expand Down
5 changes: 4 additions & 1 deletion cmd/api/api/snapshots.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,10 @@ func (s *ApiService) ForkSnapshot(ctx context.Context, request oapi.ForkSnapshot
return oapi.ForkSnapshot400JSONResponse{Code: "invalid_request", Message: "request body is required"}, nil
}

domainReq := instances.ForkSnapshotRequest{Name: request.Body.Name}
domainReq := instances.ForkSnapshotRequest{
Name: request.Body.Name,
WaitForNetwork: request.Body.WaitForNetwork,
}
if request.Body.TargetState != nil {
domainReq.TargetState = instances.State(*request.Body.TargetState)
}
Expand Down
61 changes: 61 additions & 0 deletions cmd/api/api/snapshots_test.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,35 @@
package api

import (
"context"
"testing"
"time"

"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/instances"
"github.com/kernel/hypeman/lib/oapi"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

type captureForkSnapshotManager struct {
instances.Manager
lastID string
lastReq *instances.ForkSnapshotRequest
result *instances.Instance
err error
}

func (m *captureForkSnapshotManager) ForkSnapshot(ctx context.Context, snapshotID string, req instances.ForkSnapshotRequest) (*instances.Instance, error) {
reqCopy := req
m.lastID = snapshotID
m.lastReq = &reqCopy
if m.err != nil {
return nil, m.err
}
return m.result, nil
}

func TestSnapshotScheduleToOAPIPreservesZeroMaxCount(t *testing.T) {
t.Parallel()

Expand All @@ -30,3 +51,43 @@ func TestSnapshotScheduleToOAPIPreservesZeroMaxCount(t *testing.T) {
require.NotNil(t, out.Retention.MaxAge)
assert.Equal(t, "24h0m0s", *out.Retention.MaxAge)
}

func TestForkSnapshotMapsWaitForNetwork(t *testing.T) {
t.Parallel()
svc := newTestService(t)

forked := &instances.Instance{
StoredMetadata: instances.StoredMetadata{
Id: "forked-instance",
Name: "forked-instance",
Image: "docker.io/library/alpine:latest",
CreatedAt: time.Now(),
HypervisorType: hypervisor.TypeFirecracker,
},
State: instances.StateRunning,
}
mockMgr := &captureForkSnapshotManager{
Manager: svc.InstanceManager,
result: forked,
}
svc.InstanceManager = mockMgr
waitForNetwork := false

resp, err := svc.ForkSnapshot(ctx(), oapi.ForkSnapshotRequestObject{
SnapshotId: "snap-123",
Body: &oapi.ForkSnapshotRequest{
Name: "forked-instance",
WaitForNetwork: &waitForNetwork,
},
})
require.NoError(t, err)

created, ok := resp.(oapi.ForkSnapshot201JSONResponse)
require.True(t, ok, "expected 201 response")
assert.Equal(t, "forked-instance", created.Name)
assert.Equal(t, "snap-123", mockMgr.lastID)
require.NotNil(t, mockMgr.lastReq)
assert.Equal(t, "forked-instance", mockMgr.lastReq.Name)
require.NotNil(t, mockMgr.lastReq.WaitForNetwork)
assert.False(t, *mockMgr.lastReq.WaitForNetwork)
}
18 changes: 18 additions & 0 deletions lib/forkvm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,24 @@ to work across implementations.
For networked forks, the fork gets a fresh host/guest identity (IP, MAC, TAP)
instead of reusing the source identity.

## Resume network handoff

Networked standby/running forks need a new host-side allocation, but the guest
memory snapshot still contains the source VM's old interface state. On restore,
Hypeman prepares the fork's TAP/IP/MAC before the VM resumes, then hands the new
guest network config to the guest through a small mailbox embedded in snapshot
memory. After resume, VMGenID tells the guest-agent that this is a restored VM;
the guest-agent reads the mailbox and applies the new MAC, address, route, and
neighbor state with netlink.

For API calls that return a running fork, `wait_for_network` defaults to true.
In that mode Hypeman waits for a guest UDP "applied" ack before returning, so
the fast path still avoids making host-initiated guest RPC/vsock contact as the
first post-resume dependency. If `wait_for_network=false`, the API returns after
resume once the mailbox has been patched and the guest finishes the network
handoff asynchronously. If the mailbox path is unavailable, restore falls back
to the older host-initiated guest network reconfigure path.

## Fork data copy behavior

- Guest directory copy is **sparse-only** for regular files.
Expand Down
107 changes: 107 additions & 0 deletions lib/guest/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,12 @@ func GetOrCreateConn(ctx context.Context, dialer hypervisor.VsockDialer) (*grpc.
}

// Create new connection using the VsockDialer
traceCtx := ctx
conn, err := grpc.Dial("passthrough:///vsock",
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
if span := trace.SpanFromContext(traceCtx); span.SpanContext().IsValid() {
ctx = trace.ContextWithSpan(ctx, span)
}
netConn, err := dialer.DialVsock(ctx, vsockGuestPort)
if err != nil {
return nil, &AgentVSockDialError{Err: err}
Expand Down Expand Up @@ -146,6 +150,109 @@ type ExecOptions struct {
ResizeChan <-chan *WindowSize // Optional: channel to receive resize events (pointer to avoid copying mutex)
}

type ReconfigureNetworkOptions struct {
InterfaceName string
MAC string
IPv4 string
Prefix uint32
Gateway string
WaitForAgent time.Duration
}

func ReconfigureNetworkInInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts ReconfigureNetworkOptions) error {
if opts.WaitForAgent == 0 {
return reconfigureNetworkOnce(ctx, dialer, opts)
}

ctx, span := otel.Tracer("hypeman/guest").Start(ctx, "guest.reconfigure_network", trace.WithAttributes(
attribute.Bool("wait_for_agent", true),
attribute.Int64("wait_for_agent_ms", opts.WaitForAgent.Milliseconds()),
))
defer span.End()

deadline := time.Now().Add(opts.WaitForAgent)
start := time.Now()
attempts := 0
retryableAttempts := 0
firstRetryableErrorType := ""
lastRetryableErrorType := ""
lastRetryInterval := time.Duration(0)

for {
attempts++
err := reconfigureNetworkOnce(ctx, dialer, opts)
if err == nil {
recordGuestExecWait(span, start, attempts, retryableAttempts, firstRetryableErrorType, lastRetryableErrorType, lastRetryInterval)
span.SetStatus(otelcodes.Ok, "")
return nil
}
if !isRetryableConnectionError(err) {
recordGuestExecWait(span, start, attempts, retryableAttempts, firstRetryableErrorType, lastRetryableErrorType, lastRetryInterval)
span.RecordError(err)
span.SetStatus(otelcodes.Error, err.Error())
return err
}

retryableAttempts++
errType := retryableConnectionErrorType(err)
if firstRetryableErrorType == "" {
firstRetryableErrorType = errType
}
lastRetryableErrorType = errType
CloseConn(dialer.Key())

if time.Now().After(deadline) {
recordGuestExecWait(span, start, attempts, retryableAttempts, firstRetryableErrorType, lastRetryableErrorType, lastRetryInterval)
span.RecordError(err)
span.SetStatus(otelcodes.Error, err.Error())
return err
}

retryInterval := guestExecRetryInterval(time.Since(start))
lastRetryInterval = retryInterval
select {
case <-ctx.Done():
recordGuestExecWait(span, start, attempts, retryableAttempts, firstRetryableErrorType, lastRetryableErrorType, lastRetryInterval)
span.RecordError(ctx.Err())
span.SetStatus(otelcodes.Error, ctx.Err().Error())
return ctx.Err()
case <-time.After(retryInterval):
}
}
}

func reconfigureNetworkOnce(ctx context.Context, dialer hypervisor.VsockDialer, opts ReconfigureNetworkOptions) error {
grpcConn, err := GetOrCreateConn(ctx, dialer)
if err != nil {
return fmt.Errorf("get grpc connection: %w", err)
}
client := NewGuestServiceClient(grpcConn)

_, span := otel.Tracer("hypeman/guest").Start(ctx, "guest.reconfigure_network.rpc")
_, err = client.ReconfigureNetwork(ctx, &ReconfigureNetworkRequest{
InterfaceName: opts.InterfaceName,
Mac: opts.MAC,
Ipv4: opts.IPv4,
Prefix: opts.Prefix,
Gateway: opts.Gateway,
})
finishGuestNetworkStepSpan(span, err)
if err != nil {
return fmt.Errorf("reconfigure network rpc: %w", err)
}
return nil
}

func finishGuestNetworkStepSpan(span trace.Span, err error) {
if err != nil {
span.RecordError(err)
span.SetStatus(otelcodes.Error, err.Error())
} else {
span.SetStatus(otelcodes.Ok, "")
}
span.End()
}

// ExecIntoInstance executes command in instance via vsock using gRPC.
// The dialer is a hypervisor-specific VsockDialer that knows how to connect to the guest.
// If WaitForAgent is set, it will retry on connection errors until the timeout.
Expand Down
Loading
Loading