Skip to content
Merged
56 changes: 56 additions & 0 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,62 @@ func (s *ApiService) StatInstancePath(ctx context.Context, request oapi.StatInst
return response, nil
}

// UpdateInstance applies a patch to the mutable properties of a running instance.
// At present the only supported change is updating env vars referenced by
// credential policies, which is used for key rotation.
// Note: the target instance is resolved by the ResolveResource middleware.
//
// Manager errors are mapped onto API responses as follows: ErrNotFound -> 404,
// ErrInvalidState -> 409, ErrInvalidRequest -> 400, anything else -> 500 (logged).
// The Go error return is always nil; failures are reported through the response.
func (s *ApiService) UpdateInstance(ctx context.Context, request oapi.UpdateInstanceRequestObject) (oapi.UpdateInstanceResponseObject, error) {
	target := mw.GetResolvedInstance[instances.Instance](ctx)
	if target == nil {
		return oapi.UpdateInstance500JSONResponse{
			Code:    "internal_error",
			Message: "resource not resolved",
		}, nil
	}
	lg := logger.FromContext(ctx)

	body := request.Body
	if body == nil {
		return oapi.UpdateInstance400JSONResponse{
			Code:    "invalid_request",
			Message: "request body is required",
		}, nil
	}

	// An absent env patch is forwarded as an empty (non-nil) map.
	var envPatch map[string]string
	if body.Env != nil {
		envPatch = *body.Env
	} else {
		envPatch = map[string]string{}
	}

	updated, err := s.InstanceManager.UpdateInstance(ctx, target.Id, instances.UpdateInstanceRequest{
		Env: envPatch,
	})

	switch {
	case err == nil:
		return oapi.UpdateInstance200JSONResponse(instanceToOAPI(*updated)), nil
	case errors.Is(err, instances.ErrNotFound):
		return oapi.UpdateInstance404JSONResponse{
			Code:    "not_found",
			Message: "instance not found",
		}, nil
	case errors.Is(err, instances.ErrInvalidState):
		// The manager's message is surfaced so the caller can see which state blocked the update.
		return oapi.UpdateInstance409JSONResponse{
			Code:    "invalid_state",
			Message: err.Error(),
		}, nil
	case errors.Is(err, instances.ErrInvalidRequest):
		// The manager's message is surfaced so the caller can see what was wrong with the request.
		return oapi.UpdateInstance400JSONResponse{
			Code:    "invalid_request",
			Message: err.Error(),
		}, nil
	default:
		// Unclassified errors are logged here and replaced by a generic message in the response.
		lg.ErrorContext(ctx, "failed to update instance", "error", err)
		return oapi.UpdateInstance500JSONResponse{
			Code:    "internal_error",
			Message: "failed to update instance",
		}, nil
	}
}

// AttachVolume attaches a volume to an instance (not yet implemented)
func (s *ApiService) AttachVolume(ctx context.Context, request oapi.AttachVolumeRequestObject) (oapi.AttachVolumeResponseObject, error) {
return oapi.AttachVolume500JSONResponse{
Expand Down
141 changes: 141 additions & 0 deletions cmd/api/api/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,14 @@ type captureForkManager struct {
err error
}

// captureUpdateManager is a test double for instances.Manager that intercepts
// UpdateInstance. It embeds a real Manager so every other method is delegated
// to the embedded implementation.
type captureUpdateManager struct {
	instances.Manager

	// Canned outcome for UpdateInstance. A non-nil err is returned first;
	// otherwise a non-nil result is returned as-is.
	result *instances.Instance
	err    error

	// Arguments recorded from the most recent UpdateInstance call.
	lastID  string
	lastReq *instances.UpdateInstanceRequest
}

func (m *captureForkManager) ForkInstance(ctx context.Context, id string, req instances.ForkInstanceRequest) (*instances.Instance, error) {
reqCopy := req
m.lastID = id
Expand All @@ -214,6 +222,31 @@ func (m *captureForkManager) ForkInstance(ctx context.Context, id string, req in
return m.result, nil
}

// UpdateInstance records the instance ID and a copy of the request, then
// returns the configured outcome: m.err if set, else m.result if set, else a
// synthesized running instance that echoes the requested ID and env.
func (m *captureUpdateManager) UpdateInstance(ctx context.Context, id string, req instances.UpdateInstanceRequest) (*instances.Instance, error) {
	// Store a copy so lastReq does not point at the parameter itself.
	captured := req
	m.lastID, m.lastReq = id, &captured

	switch {
	case m.err != nil:
		return nil, m.err
	case m.result != nil:
		return m.result, nil
	}

	// No canned outcome configured: fabricate a plausible updated instance.
	fallback := instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id:             id,
			Name:           "updated-instance",
			Image:          "docker.io/library/alpine:latest",
			Env:            req.Env,
			CreatedAt:      time.Now(),
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateRunning,
	}
	return &fallback, nil
}

func (m *captureCreateManager) CreateInstance(ctx context.Context, req instances.CreateInstanceRequest) (*instances.Instance, error) {
reqCopy := req
m.lastReq = &reqCopy
Expand Down Expand Up @@ -408,6 +441,114 @@ func TestCreateInstance_MapsNetworkEgressEnforcementMode(t *testing.T) {
assert.Equal(t, instances.EgressEnforcementModeHTTPHTTPSOnly, mockMgr.lastReq.NetworkEgress.EnforcementMode)
}

// TestUpdateInstance_MapsEnvPatch checks that the handler forwards the resolved
// instance ID and the env patch from the request body to the instance manager,
// and responds with 200 when the manager succeeds.
func TestUpdateInstance_MapsEnvPatch(t *testing.T) {
	t.Parallel()
	svc := newTestService(t)

	createdAt := time.Now()
	// newInstance builds the fixture instance; only the stored env differs between uses.
	newInstance := func(storedEnv map[string]string) *instances.Instance {
		return &instances.Instance{
			StoredMetadata: instances.StoredMetadata{
				Id:             "inst-update",
				Name:           "inst-update",
				Image:          "docker.io/library/alpine:latest",
				Env:            storedEnv,
				CreatedAt:      createdAt,
				HypervisorType: hypervisor.TypeCloudHypervisor,
			},
			State: instances.StateRunning,
		}
	}

	// Wrap the real manager so only UpdateInstance is intercepted.
	fake := &captureUpdateManager{
		Manager: svc.InstanceManager,
		result:  newInstance(map[string]string{"OUTBOUND_OPENAI_KEY": "rotated-key-456"}),
	}
	svc.InstanceManager = fake

	patch := map[string]string{"OUTBOUND_OPENAI_KEY": "rotated-key-456"}
	resolved := newInstance(nil)
	reqCtx := mw.WithResolvedInstance(ctx(), resolved.Id, resolved)

	resp, err := svc.UpdateInstance(reqCtx, oapi.UpdateInstanceRequestObject{
		Id:   resolved.Id,
		Body: &oapi.UpdateInstanceRequest{Env: &patch},
	})
	require.NoError(t, err)
	_, is200 := resp.(oapi.UpdateInstance200JSONResponse)
	require.True(t, is200, "expected 200 response")

	// The manager must have seen the resolved ID and the patched env value.
	require.NotNil(t, fake.lastReq)
	assert.Equal(t, resolved.Id, fake.lastID)
	assert.Equal(t, "rotated-key-456", fake.lastReq.Env["OUTBOUND_OPENAI_KEY"])
}

// TestUpdateInstance_RequiresBody checks that a request without a body is
// rejected with a 400 "invalid_request" response.
func TestUpdateInstance_RequiresBody(t *testing.T) {
	t.Parallel()
	svc := newTestService(t)

	target := &instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id:             "inst-update",
			Name:           "inst-update",
			Image:          "docker.io/library/alpine:latest",
			CreatedAt:      time.Now(),
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateRunning,
	}
	reqCtx := mw.WithResolvedInstance(ctx(), target.Id, target)

	// Body is deliberately left nil.
	resp, err := svc.UpdateInstance(reqCtx, oapi.UpdateInstanceRequestObject{Id: target.Id})
	require.NoError(t, err)

	rejected, is400 := resp.(oapi.UpdateInstance400JSONResponse)
	require.True(t, is400, "expected 400 response")
	assert.Equal(t, "invalid_request", rejected.Code)
	assert.Contains(t, rejected.Message, "request body is required")
}

// TestUpdateInstance_MapsInvalidRequestError checks that a manager error
// wrapping instances.ErrInvalidRequest becomes a 400 "invalid_request"
// response whose message carries the manager's error text.
func TestUpdateInstance_MapsInvalidRequestError(t *testing.T) {
	t.Parallel()
	svc := newTestService(t)

	// Wrap the real manager and force UpdateInstance to fail with a wrapped sentinel.
	failing := &captureUpdateManager{
		Manager: svc.InstanceManager,
		err:     fmt.Errorf("%w: env keys [UNRELATED_KEY] are not credential source env vars; allowed keys: [OUTBOUND_OPENAI_KEY]", instances.ErrInvalidRequest),
	}
	svc.InstanceManager = failing

	target := &instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id:             "inst-update",
			Name:           "inst-update",
			Image:          "docker.io/library/alpine:latest",
			CreatedAt:      time.Now(),
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateRunning,
	}
	patch := map[string]string{"UNRELATED_KEY": "value"}
	reqCtx := mw.WithResolvedInstance(ctx(), target.Id, target)

	resp, err := svc.UpdateInstance(reqCtx, oapi.UpdateInstanceRequestObject{
		Id:   target.Id,
		Body: &oapi.UpdateInstanceRequest{Env: &patch},
	})
	require.NoError(t, err)

	rejected, is400 := resp.(oapi.UpdateInstance400JSONResponse)
	require.True(t, is400, "expected 400 response")
	assert.Equal(t, "invalid_request", rejected.Code)
	assert.Contains(t, rejected.Message, "UNRELATED_KEY")
}

func TestForkInstance_Success(t *testing.T) {
t.Parallel()
svc := newTestService(t)
Expand Down
4 changes: 4 additions & 0 deletions lib/builds/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ func (m *mockInstanceManager) StartInstance(ctx context.Context, id string, req
return nil, nil
}

// UpdateInstance is a no-op stub on the mock: it ignores its arguments and
// returns a nil instance together with a nil error.
func (m *mockInstanceManager) UpdateInstance(ctx context.Context, id string, req instances.UpdateInstanceRequest) (*instances.Instance, error) {
return nil, nil
}

// StreamInstanceLogs is a no-op stub on the mock: it ignores its arguments and
// returns a nil channel together with a nil error.
func (m *mockInstanceManager) StreamInstanceLogs(ctx context.Context, id string, tail int, follow bool, source instances.LogSource) (<-chan string, error) {
return nil, nil
}
Expand Down
89 changes: 89 additions & 0 deletions lib/egressproxy/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ When enabled for an instance, hypeman does three things:
2. It injects proxy environment variables into the guest (`HTTP_PROXY` / `HTTPS_PROXY`) and installs the proxy CA certificate in the guest trust store.
3. It enforces policy on the host to prevent direct outbound TCP egress from the VM unless traffic is going to the bridge gateway (the proxy), depending on `network.egress.enforcement.mode`.

## How the feature works

At a high level, the feature separates what the guest sees from what the host uses for outbound authentication:

- The guest gets normal proxy configuration plus mock credential values.
- The host keeps the real credential values in instance metadata.
- The host-side proxy injects the real values into outbound HTTPS headers only when the configured policy matches.

That means the VM can make authenticated outbound requests without ever receiving the real secret material directly.

## Secret substitution flow

- API callers provide real secret values in instance `env`.
Expand All @@ -30,6 +40,25 @@ When enabled for an instance, hypeman does three things:

This keeps real secrets out of the VM while still allowing authenticated egress requests.

## Credential rotation via instance update

The feature also supports rotating real credential values without restarting the VM.

- `PATCH /instances/{id}` accepts updates to env keys that are already referenced by existing credential `source.env` bindings.
- The request updates the host-side stored value for that credential source.
- If the instance is currently registered with the egress proxy, hypeman recompiles the proxy's header injection rules using the new real value.
- The guest-visible env value does not change: inside the VM, the credential still appears as `mock-<CREDENTIAL_NAME>`.
- New outbound HTTPS requests start using the rotated value after the update succeeds.

Operationally, this is intended for key rotation, revocation/reissue flows, and similar secret lifecycle events where you want host-side outbound auth behavior to change immediately without a guest reboot.

### Update safety behavior

- The update path only accepts env keys already bound by the instance's credential policy.
- If proxy rule recompilation fails, the running instance keeps using the old value.
- If runtime rules are updated but metadata persistence fails, hypeman rolls the proxy rules back to the previous value before returning an error.
- Invalid or unreadable metadata is treated as an internal failure, not as a synthetic "instance not found" result.

## Security behavior

- Real secret values are persisted in the normal instance `env` metadata, which is already host-side state.
Expand All @@ -42,3 +71,63 @@ This keeps real secrets out of the VM while still allowing authenticated egress
- Header injection is applied to HTTP headers only (not request/response bodies).
- Non-HTTP protocols or custom ports are not rewritten by the MITM layer.
- Plain HTTP requests are not eligible for secret substitution.

## Observability

This feature exposes operator-facing logs, traces, and metrics for both the control plane and the proxy data plane.

### Logs

- Egress proxy logs use the `EGRESS` logging subsystem.
- Control-plane actions such as register, unregister, and rule update include `instance_id` so they can be correlated with per-instance logs.
- Upstream proxy failures are logged with low-cardinality fields such as `protocol` and whether header injection occurred (`injected=true/false`).
- When request trace context is available, logs include `trace_id` and `span_id`.

In normal operation, the important things to watch for are:

- repeated "failed to configure egress proxy" errors during create, start, or restore
- repeated "failed to update egress proxy rules" errors during credential rotation
- repeated upstream proxy failure warnings, especially after a credential rotation or policy rollout

### Tracing

The feature adds child spans for control-plane operations, including:

- `MaybeRegisterEgressProxy`
- `EgressProxy.RegisterInstance`
- `EgressProxy.UpdateInstanceRules`
- `EgressProxy.UnregisterInstance`

These spans include attributes such as:

- `operation`
- `proxy_enabled`
- `enforcement_mode`
- `inject_rule_count`
- `result`

This makes it possible to distinguish failures in proxy registration, runtime rule update, and teardown from the broader instance lifecycle span that triggered them.

### Metrics

Control-plane metrics:

- `hypeman_egress_proxy_registrations_total{operation,result,enforcement_mode}`
- `hypeman_egress_proxy_rule_updates_total{result}`
- `hypeman_egress_proxy_registered_instances_total`
- `hypeman_egress_proxy_control_plane_duration_seconds{operation,result}`

Data-plane metrics:

- `hypeman_egress_proxy_requests_total{protocol,result,injected}`
- `hypeman_egress_proxy_upstream_duration_seconds{protocol,result}`
- `hypeman_egress_proxy_upstream_failures_total{protocol}`

These labels are intentionally low-cardinality. In particular, the destination host is not used as a metric label.

### What operators should look for

- A rise in `hypeman_egress_proxy_registrations_total{result="error"}` usually means create/start/restore flows are failing to attach egress mediation correctly.
- A rise in `hypeman_egress_proxy_rule_updates_total{result="error"}` means key rotation requests are being rejected or failing to apply.
- `hypeman_egress_proxy_registered_instances_total` should roughly match the number of running instances that currently have `network.egress.enabled=true`.
- A rise in `hypeman_egress_proxy_upstream_failures_total` or a latency increase in `hypeman_egress_proxy_upstream_duration_seconds` usually points to upstream reachability, TLS trust, or destination-side issues rather than guest boot problems.
Loading
Loading