From 2f4353412d1bcf305898f676a815d342304ffa28 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 19 Jul 2024 13:29:41 -0400 Subject: [PATCH] keyring: support prepublishing keys (#23577) When a root key is rotated, the servers immediately start signing Workload Identities with the new active key. But workloads may be using those WI tokens to sign into external services, which may not have had time to fetch the new public key and which might try to fetch new keys as needed. Add support for prepublishing keys. Prepublished keys will be visible in the JWKS endpoint but will not be used for signing or encryption until their `PublishTime`. Update the periodic key rotation to prepublish keys at half the `root_key_rotation_threshold` window, and promote prepublished keys to active after the `PublishTime`. This changeset also fixes two bugs in periodic root key rotation and garbage collection, both of which can't be safely fixed without implementing prepublishing: * Periodic root key rotation would never happen because the default `root_key_rotation_threshold` of 720h exceeds the 72h maximum window of the FSM time table. We now compare the `CreateTime` against the wall clock time instead of the time table. (We expect to remove the time table in future work, ref https://github.com/hashicorp/nomad/issues/16359) * Root key garbage collection could GC keys that were used to sign identities. We now wait until `root_key_rotation_threshold` + `root_key_gc_threshold` before GC'ing a key. * When rekeying a root key, the core job did not mark the key as inactive after the rekey was complete. Ref: https://hashicorp.atlassian.net/browse/NET-10398 Ref: https://hashicorp.atlassian.net/browse/NET-10280 Fixes: https://github.com/hashicorp/nomad/issues/19669 Fixes: https://github.com/hashicorp/nomad/issues/23528 Fixes: https://github.com/hashicorp/nomad/issues/19368 --- .changelog/23577.txt | 15 ++ api/keyring.go | 18 +- command/agent/keyring_endpoint.go | 10 + command/agent/keyring_endpoint_test.go | 82 ++++-- command/operator_root_keyring.go | 10 +- command/operator_root_keyring_rotate.go | 45 +++- nomad/core_sched.go | 139 ++++++++-- nomad/core_sched_test.go | 240 +++++++++++++----- nomad/encrypter.go | 10 +- nomad/encrypter_test.go | 2 +- nomad/keyring_endpoint.go | 11 +- nomad/keyring_endpoint_test.go | 80 ++++-- nomad/leader.go | 2 +- nomad/state/state_store.go | 12 +- nomad/state/state_store_test.go | 11 +- nomad/structs/keyring.go | 102 ++++++-- .../commands/operator/root/keyring-rotate.mdx | 31 ++- website/content/docs/configuration/server.mdx | 16 +- .../content/docs/integrations/consul/acl.mdx | 8 +- .../content/docs/integrations/vault/acl.mdx | 8 +- .../content/docs/upgrade/upgrade-specific.mdx | 14 +- 21 files changed, 652 insertions(+), 214 deletions(-) create mode 100644 .changelog/23577.txt diff --git a/.changelog/23577.txt b/.changelog/23577.txt new file mode 100644 index 000000000000..353aa2b9ceb6 --- /dev/null +++ b/.changelog/23577.txt @@ -0,0 +1,15 @@ +```release-note:improvement +keyring: Added support for prepublishing keys +``` + +```release-note:bug +keyring: Fixed a bug where periodic key rotation would not occur +``` + +```release-note:bug +keyring: Fixed a bug where keys could be garbage collected before workload identities expire +``` + +```release-note:bug +keyring: Fixed a bug where keys would never exit the "rekeying" state after a rotation with the `-full` flag +``` diff --git a/api/keyring.go b/api/keyring.go index d87d8b720fcc..69e0c9c10731 100644 --- a/api/keyring.go +++ b/api/keyring.go @@ -34,16 +34,18 @@ type RootKeyMeta struct { CreateIndex uint64 ModifyIndex uint64 State RootKeyState + PublishTime int64 } // RootKeyState enum describes the lifecycle of a root key. type RootKeyState string const ( - RootKeyStateInactive RootKeyState = "inactive" - RootKeyStateActive = "active" - RootKeyStateRekeying = "rekeying" - RootKeyStateDeprecated = "deprecated" + RootKeyStateInactive RootKeyState = "inactive" + RootKeyStateActive = "active" + RootKeyStateRekeying = "rekeying" + RootKeyStateDeprecated = "deprecated" + RootKeyStatePrepublished = "prepublished" ) // List lists all the keyring metadata @@ -78,6 +80,9 @@ func (k *Keyring) Rotate(opts *KeyringRotateOptions, w *WriteOptions) (*RootKeyM if opts.Full { qp.Set("full", "true") } + if opts.PublishTime > 0 { + qp.Set("publish_time", fmt.Sprintf("%d", opts.PublishTime)) + } } resp := &struct{ Key *RootKeyMeta }{} wm, err := k.client.put("/v1/operator/keyring/rotate?"+qp.Encode(), nil, resp, w) @@ -86,6 +91,7 @@ func (k *Keyring) Rotate(opts *KeyringRotateOptions, w *WriteOptions) (*RootKeyM // KeyringRotateOptions are parameters for the Rotate API type KeyringRotateOptions struct { - Full bool - Algorithm EncryptionAlgorithm + Full bool + Algorithm EncryptionAlgorithm + PublishTime int64 } diff --git a/command/agent/keyring_endpoint.go b/command/agent/keyring_endpoint.go index b753507323cc..870aae3f06c4 100644 --- a/command/agent/keyring_endpoint.go +++ b/command/agent/keyring_endpoint.go @@ -6,6 +6,7 @@ package agent import ( "fmt" "net/http" + "strconv" "strings" "time" @@ -167,6 +168,15 @@ func (s *HTTPServer) keyringRotateRequest(resp http.ResponseWriter, req *http.Re args.Full = true } + ptRaw := query.Get("publish_time") + if ptRaw != "" { + publishTime, err := strconv.ParseInt(ptRaw, 10, 64) + if err != nil { + return nil, fmt.Errorf("invalid publish_time: %w", err) + } + args.PublishTime = publishTime + } + var out structs.KeyringRotateRootKeyResponse if err := s.agent.RPC("Keyring.Rotate", &args, &out); err != nil { return nil, err diff --git a/command/agent/keyring_endpoint_test.go b/command/agent/keyring_endpoint_test.go index 938642795af7..428da9d5078e 100644 --- a/command/agent/keyring_endpoint_test.go +++ b/command/agent/keyring_endpoint_test.go @@ -4,6 +4,7 @@ package agent import ( + "fmt" "net/http" "net/http/httptest" "strconv" @@ -13,7 +14,6 @@ import ( "github.com/go-jose/go-jose/v3" "github.com/shoenig/test/must" - "github.com/stretchr/testify/require" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/nomad/structs" @@ -29,57 +29,83 @@ func TestHTTP_Keyring_CRUD(t *testing.T) { // List (get bootstrap key) req, err := http.NewRequest(http.MethodGet, "/v1/operator/keyring/keys", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err := s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) listResp := obj.([]*structs.RootKeyMeta) - require.Len(t, listResp, 1) - oldKeyID := listResp[0].KeyID + must.Len(t, 1, listResp) + key0 := listResp[0].KeyID // Rotate req, err = http.NewRequest(http.MethodPut, "/v1/operator/keyring/rotate", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) - require.NotZero(t, respW.HeaderMap.Get("X-Nomad-Index")) + must.NoError(t, err) + must.NotEq(t, "", respW.HeaderMap.Get("X-Nomad-Index")) rotateResp := obj.(structs.KeyringRotateRootKeyResponse) - require.NotNil(t, rotateResp.Key) - require.True(t, rotateResp.Key.Active()) - newID1 := rotateResp.Key.KeyID + must.NotNil(t, rotateResp.Key) + must.True(t, rotateResp.Key.IsActive()) + key1 := rotateResp.Key.KeyID + + // Rotate with prepublish + + publishTime := time.Now().Add(24 * time.Hour).UnixNano() + req, err = http.NewRequest(http.MethodPut, + fmt.Sprintf("/v1/operator/keyring/rotate?publish_time=%d", publishTime), nil) + must.NoError(t, err) + obj, err = s.Server.KeyringRequest(respW, req) + must.NoError(t, err) + must.NotEq(t, "", respW.HeaderMap.Get("X-Nomad-Index")) + rotateResp = obj.(structs.KeyringRotateRootKeyResponse) + must.NotNil(t, rotateResp.Key) + must.True(t, rotateResp.Key.IsPrepublished()) + key2 := rotateResp.Key.KeyID // List req, err = http.NewRequest(http.MethodGet, "/v1/operator/keyring/keys", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) listResp = obj.([]*structs.RootKeyMeta) - require.Len(t, listResp, 2) + must.Len(t, 3, listResp) for _, key := range listResp { - if key.KeyID == newID1 { - require.True(t, key.Active(), "new key should be active") - } else { - require.False(t, key.Active(), "initial key should be inactive") + switch key.KeyID { + case key0: + must.True(t, key.IsInactive(), must.Sprint("initial key should be inactive")) + case key1: + must.True(t, key.IsActive(), must.Sprint("new key should be active")) + case key2: + must.True(t, key.IsPrepublished(), + must.Sprint("prepublished key should not be active")) } } - // Delete the old key and verify its gone + // Delete the original key and verify its gone - req, err = http.NewRequest(http.MethodDelete, "/v1/operator/keyring/key/"+oldKeyID, nil) - require.NoError(t, err) + req, err = http.NewRequest(http.MethodDelete, "/v1/operator/keyring/key/"+key0, nil) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) req, err = http.NewRequest(http.MethodGet, "/v1/operator/keyring/keys", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) listResp = obj.([]*structs.RootKeyMeta) - require.Len(t, listResp, 1) - require.Equal(t, newID1, listResp[0].KeyID) - require.True(t, listResp[0].Active()) - require.Len(t, listResp, 1) + must.Len(t, 2, listResp) + for _, key := range listResp { + switch key.KeyID { + case key0: + t.Fatalf("initial key should have been deleted") + case key1: + must.True(t, key.IsActive(), must.Sprint("new key should be active")) + case key2: + must.True(t, key.IsPrepublished(), + must.Sprint("prepublished key should not be active")) + } + } }) } diff --git a/command/operator_root_keyring.go b/command/operator_root_keyring.go index 63a3a14e8dc9..b92e1a577336 100644 --- a/command/operator_root_keyring.go +++ b/command/operator_root_keyring.go @@ -75,11 +75,15 @@ func renderVariablesKeysResponse(keys []*api.RootKeyMeta, verbose bool) string { length = 8 } out := make([]string, len(keys)+1) - out[0] = "Key|State|Create Time" + out[0] = "Key|State|Create Time|Publish Time" i := 1 for _, k := range keys { - out[i] = fmt.Sprintf("%s|%v|%s", - k.KeyID[:length], k.State, formatUnixNanoTime(k.CreateTime)) + publishTime := "" + if k.PublishTime > 0 { + publishTime = formatUnixNanoTime(k.PublishTime) + } + out[i] = fmt.Sprintf("%s|%v|%s|%s", + k.KeyID[:length], k.State, formatUnixNanoTime(k.CreateTime), publishTime) i = i + 1 } return formatList(out) diff --git a/command/operator_root_keyring_rotate.go b/command/operator_root_keyring_rotate.go index 9c274c772920..70271666cee9 100644 --- a/command/operator_root_keyring_rotate.go +++ b/command/operator_root_keyring_rotate.go @@ -6,6 +6,7 @@ package command import ( "fmt" "strings" + "time" "github.com/hashicorp/nomad/api" "github.com/posener/complete" @@ -36,6 +37,17 @@ Keyring Options: will immediately return and the re-encryption process will run asynchronously on the leader. + -now + Publish the new key immediately without prepublishing. One of -now or + -prepublish must be set. + + -prepublish + Set a duration for which to prepublish the new key (ex. "1h"). The currently + active key will be unchanged but the new public key will be available in the + JWKS endpoint. Multiple keys can be prepublished and they will be promoted to + active in order of publish time, at most once every root_key_gc_interval. One + of -now or -prepublish must be set. + -verbose Show full information. ` @@ -50,8 +62,10 @@ func (c *OperatorRootKeyringRotateCommand) Synopsis() string { func (c *OperatorRootKeyringRotateCommand) AutocompleteFlags() complete.Flags { return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), complete.Flags{ - "-full": complete.PredictNothing, - "-verbose": complete.PredictNothing, + "-full": complete.PredictNothing, + "-now": complete.PredictNothing, + "-prepublish": complete.PredictNothing, + "-verbose": complete.PredictNothing, }) } @@ -64,12 +78,15 @@ func (c *OperatorRootKeyringRotateCommand) Name() string { } func (c *OperatorRootKeyringRotateCommand) Run(args []string) int { - var rotateFull, verbose bool + var rotateFull, rotateNow, verbose bool + var prepublishDuration time.Duration flags := c.Meta.FlagSet("root keyring rotate", FlagSetClient) flags.Usage = func() { c.Ui.Output(c.Help()) } flags.BoolVar(&rotateFull, "full", false, "full key rotation") + flags.BoolVar(&rotateNow, "now", false, "immediately rotate without prepublish") flags.BoolVar(&verbose, "verbose", false, "") + flags.DurationVar(&prepublishDuration, "prepublish", 0, "prepublish key") if err := flags.Parse(args); err != nil { return 1 @@ -88,8 +105,28 @@ func (c *OperatorRootKeyringRotateCommand) Run(args []string) int { return 1 } + if !rotateNow && prepublishDuration == 0 || rotateNow && prepublishDuration != 0 { + c.Ui.Error(` +One of "-now" or "-prepublish" must be used. + +If a key has been leaked use "-now" to force immediate rotation. + +Otherwise please use "-prepublish " to ensure the new key is not used +to sign workload identities before JWKS endpoints are updated. +`) + return 1 + } + + publishTime := int64(0) + if prepublishDuration > 0 { + publishTime = time.Now().UnixNano() + prepublishDuration.Nanoseconds() + } + resp, _, err := client.Keyring().Rotate( - &api.KeyringRotateOptions{Full: rotateFull}, nil) + &api.KeyringRotateOptions{ + Full: rotateFull, + PublishTime: publishTime, + }, nil) if err != nil { c.Ui.Error(fmt.Sprintf("error: %s", err)) return 1 diff --git a/nomad/core_sched.go b/nomad/core_sched.go index ebbeecb234c0..89afb6c3489f 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -99,7 +99,7 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error { if err := c.expiredACLTokenGC(eval, true); err != nil { return err } - if err := c.rootKeyGC(eval); err != nil { + if err := c.rootKeyGC(eval, time.Now()); err != nil { return err } // Node GC must occur after the others to ensure the allocations are @@ -902,20 +902,17 @@ func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error { // a rotation will be sent to the leader so our view of state // is no longer valid. we ack this core job and will pick up // the GC work on the next interval - wasRotated, err := c.rootKeyRotate(eval) + wasRotated, err := c.rootKeyRotate(eval, time.Now()) if err != nil { return err } if wasRotated { return nil } - return c.rootKeyGC(eval) + return c.rootKeyGC(eval, time.Now()) } -func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { - - oldThreshold := c.getThreshold(eval, "root key", - "root_key_gc_threshold", c.srv.config.RootKeyGCThreshold) +func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation, now time.Time) error { ws := memdb.NewWatchSet() iter, err := c.snap.RootKeyMetas(ws) @@ -923,19 +920,31 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { return err } + // the threshold is longer than we can support with the time table, and we + // never want to force-GC keys because that will orphan signed Workload + // Identities + rotationThreshold := now.Add(-1 * + (c.srv.config.RootKeyRotationThreshold + c.srv.config.RootKeyGCThreshold)) + for { raw := iter.Next() if raw == nil { break } keyMeta := raw.(*structs.RootKeyMeta) - if keyMeta.Active() || keyMeta.Rekeying() { - continue // never GC the active key or one we're rekeying + if !keyMeta.IsInactive() { + continue // never GC keys we're still using } - if keyMeta.CreateIndex > oldThreshold { - continue // don't GC recent keys + + c.logger.Trace("checking inactive key eligibility for gc", + "create_time", keyMeta.CreateTime, "threshold", rotationThreshold.UnixNano()) + + if keyMeta.CreateTime > rotationThreshold.UnixNano() { + continue // don't GC keys with potentially live Workload Identities } + // don't GC keys used to encrypt Variables or sign legacy non-expiring + // Workload Identities inUse, err := c.snap.IsRootKeyMetaInUse(keyMeta.KeyID) if err != nil { return err @@ -961,26 +970,97 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { return nil } -// rootKeyRotate checks if the active key is old enough that we need -// to kick off a rotation. -func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) (bool, error) { - - rotationThreshold := c.getThreshold(eval, "root key", - "root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold) +// rootKeyRotate checks if the active key is old enough that we need to kick off +// a rotation. It prepublishes a key first and only promotes that prepublished +// key to active once the rotation threshold has expired +func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation, now time.Time) (bool, error) { ws := memdb.NewWatchSet() - activeKey, err := c.snap.GetActiveRootKeyMeta(ws) + iter, err := c.snap.RootKeyMetas(ws) if err != nil { return false, err } + + var ( + activeKey *structs.RootKeyMeta + prepublishedKey *structs.RootKeyMeta + ) + + for raw := iter.Next(); raw != nil; raw = iter.Next() { + key := raw.(*structs.RootKeyMeta) + switch key.State { + case structs.RootKeyStateActive: + activeKey = key + case structs.RootKeyStatePrepublished: + // multiple keys can be prepublished, so we only want to handle the + // very next one + if prepublishedKey == nil { + prepublishedKey = key + } else if prepublishedKey.PublishTime > key.PublishTime { + prepublishedKey = key + } + } + } + + if prepublishedKey != nil { + c.logger.Trace("checking prepublished key eligibility for promotion", + "publish_time", prepublishedKey.PublishTime, "now", now.UnixNano()) + + if prepublishedKey.PublishTime > now.UnixNano() { + // at this point we have a key in a prepublished state but it's not + // ready to be made active, so we bail out. otherwise we'd kick off + // a new rotation every time we process this eval and we're past + // internval/2 + return false, nil + } + + rootKey, err := c.srv.encrypter.GetKey(prepublishedKey.KeyID) + if err != nil { + c.logger.Error("prepublished key does not exist in keyring", "error", err) + return false, nil + } + rootKey = rootKey.MakeActive() + + req := &structs.KeyringUpdateRootKeyRequest{ + RootKey: rootKey, + WriteRequest: structs.WriteRequest{ + Region: c.srv.config.Region, + AuthToken: eval.LeaderACL, + }, + } + + if err := c.srv.RPC("Keyring.Update", + req, &structs.KeyringUpdateRootKeyResponse{}); err != nil { + c.logger.Error("setting prepublished key active failed", "error", err) + return false, err + } + return true, nil + } + + // There's no prepublished key so prepublish one now + if activeKey == nil { - return false, nil // no active key + c.logger.Warn("keyring has no active key: rotate keyring to repair") + return false, nil } - if activeKey.CreateIndex >= rotationThreshold { + + // we rotate at half the rotation threshold because we want to prepublish a key + rotationThreshold := now.Add(-1 * c.srv.config.RootKeyRotationThreshold / 2) + + c.logger.Trace("checking active key eligibility for rotation", + "create_time", activeKey.CreateTime, "threshold", rotationThreshold.UnixNano()) + + if activeKey.CreateTime > rotationThreshold.UnixNano() { return false, nil // key is too new } + // this eval may be processed up to RootKeyGCInterval after the halfway + // mark, so use the CreateTime of the previous key rather than the wall + // clock to set the publish time + publishTime := activeKey.CreateTime + c.srv.config.RootKeyRotationThreshold.Nanoseconds() + req := &structs.KeyringRotateRootKeyRequest{ + PublishTime: publishTime, WriteRequest: structs.WriteRequest{ Region: c.srv.config.Region, AuthToken: eval.LeaderACL, @@ -1014,7 +1094,7 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error { break } keyMeta := raw.(*structs.RootKeyMeta) - if !keyMeta.Rekeying() { + if !keyMeta.IsRekeying() { continue } varIter, err := c.snap.GetVariablesByKeyID(ws, keyMeta.KeyID) @@ -1026,6 +1106,23 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error { return err } + rootKey, err := c.srv.encrypter.GetKey(keyMeta.KeyID) + if err != nil { + return fmt.Errorf("rotated key does not exist in keyring: %w", err) + } + rootKey = rootKey.MakeInactive() + + req := &structs.KeyringUpdateRootKeyRequest{ + RootKey: rootKey, + WriteRequest: structs.WriteRequest{ + Region: c.srv.config.Region, + AuthToken: eval.LeaderACL}, + } + if err := c.srv.RPC("Keyring.Update", + req, &structs.KeyringUpdateRootKeyResponse{}); err != nil { + c.logger.Error("rekey complete but failed to mark key as inactive", "error", err) + return err + } } return nil diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 2f1347c896d6..76a04a67de93 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -19,6 +19,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -2609,32 +2610,137 @@ func TestCoreScheduler_CSIBadState_ClaimGC(t *testing.T) { } } +// TestCoreScheduler_RootKeyRotate exercises periodic rotation of the root key +func TestCoreScheduler_RootKeyRotate(t *testing.T) { + ci.Parallel(t) + + srv, cleanup := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 + c.RootKeyRotationThreshold = time.Hour + }) + defer cleanup() + testutil.WaitForKeyring(t, srv.RPC, "global") + + // active key, will never be GC'd + store := srv.fsm.State() + key0, err := store.GetActiveRootKeyMeta(nil) + must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped")) + must.NoError(t, err) + + // run the core job + snap, err := store.Snapshot() + must.NoError(t, err) + core := NewCoreScheduler(srv, snap) + index := key0.ModifyIndex + 1 + eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, index) + c := core.(*CoreScheduler) + + // Eval immediately + now := time.Unix(0, key0.CreateTime) + rotated, err := c.rootKeyRotate(eval, now) + must.NoError(t, err) + must.False(t, rotated, must.Sprint("key should not rotate")) + + // Eval after half threshold has passed + c.snap, _ = store.Snapshot() + now = time.Unix(0, key0.CreateTime+(time.Minute*40).Nanoseconds()) + rotated, err = c.rootKeyRotate(eval, now) + must.NoError(t, err) + must.True(t, rotated, must.Sprint("key should rotate")) + + var key1 *structs.RootKeyMeta + iter, err := store.RootKeyMetas(nil) + must.NoError(t, err) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + k := raw.(*structs.RootKeyMeta) + if k.KeyID == key0.KeyID { + must.True(t, k.IsActive(), must.Sprint("expected original key to be active")) + } else { + key1 = k + } + } + must.NotNil(t, key1) + must.True(t, key1.IsPrepublished()) + must.Eq(t, key0.CreateTime+time.Hour.Nanoseconds(), key1.PublishTime) + + // Externally rotate with prepublish to add a second prepublished key + resp := &structs.KeyringRotateRootKeyResponse{} + must.NoError(t, srv.RPC("Keyring.Rotate", &structs.KeyringRotateRootKeyRequest{ + PublishTime: key1.PublishTime + (time.Hour * 24).Nanoseconds(), + WriteRequest: structs.WriteRequest{Region: srv.Region()}, + }, resp)) + key2 := resp.Key + + // Eval again with time unchanged + c.snap, _ = store.Snapshot() + rotated, err = c.rootKeyRotate(eval, now) + + iter, err = store.RootKeyMetas(nil) + must.NoError(t, err) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + k := raw.(*structs.RootKeyMeta) + switch k.KeyID { + case key0.KeyID: + must.True(t, k.IsActive(), must.Sprint("original key should still be active")) + case key1.KeyID, key2.KeyID: + must.True(t, k.IsPrepublished(), must.Sprint("new key should be prepublished")) + default: + t.Fatalf("should not have created any new keys: %#v", k) + } + } + + // Eval again with time after publish time + c.snap, _ = store.Snapshot() + now = time.Unix(0, key1.PublishTime+(time.Minute*10).Nanoseconds()) + rotated, err = c.rootKeyRotate(eval, now) + + iter, err = store.RootKeyMetas(nil) + must.NoError(t, err) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + k := raw.(*structs.RootKeyMeta) + switch k.KeyID { + case key0.KeyID: + must.True(t, k.IsInactive(), must.Sprint("original key should be inactive")) + case key1.KeyID: + must.True(t, k.IsActive(), must.Sprint("prepublished key should now be active")) + case key2.KeyID: + must.True(t, k.IsPrepublished(), must.Sprint("later prepublished key should still be prepublished")) + default: + t.Fatalf("should not have created any new keys: %#v", k) + } + } +} + // TestCoreScheduler_RootKeyGC exercises root key GC func TestCoreScheduler_RootKeyGC(t *testing.T) { ci.Parallel(t) - srv, cleanup := TestServer(t, nil) + srv, cleanup := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 + c.RootKeyRotationThreshold = time.Hour + c.RootKeyGCThreshold = time.Minute * 10 + }) defer cleanup() testutil.WaitForKeyring(t, srv.RPC, "global") - // reset the time table - srv.fsm.timetable.table = make([]TimeTableEntry, 1, 10) - // active key, will never be GC'd store := srv.fsm.State() key0, err := store.GetActiveRootKeyMeta(nil) - require.NotNil(t, key0, "expected keyring to be bootstapped") - require.NoError(t, err) + must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped")) + must.NoError(t, err) + + now := key0.CreateTime + yesterday := now - (24 * time.Hour).Nanoseconds() // insert an "old" inactive key - key1 := structs.NewRootKeyMeta() - key1.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(600, key1, false)) + key1 := structs.NewRootKeyMeta().MakeInactive() + key1.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(600, key1, false)) // insert an "old" and inactive key with a variable that's using it - key2 := structs.NewRootKeyMeta() - key2.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(700, key2, false)) + key2 := structs.NewRootKeyMeta().MakeInactive() + key2.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(700, key2, false)) variable := mock.VariableEncrypted() variable.KeyID = key2.KeyID @@ -2643,89 +2749,95 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) { Op: structs.VarOpSet, Var: variable, }) - require.NoError(t, setResp.Error) + must.NoError(t, setResp.Error) // insert an "old" key that's inactive but being used by an alloc - key3 := structs.NewRootKeyMeta() - key3.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(800, key3, false)) + key3 := structs.NewRootKeyMeta().MakeInactive() + key3.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(800, key3, false)) // insert the allocation using key3 alloc := mock.Alloc() alloc.ClientStatus = structs.AllocClientStatusRunning alloc.SigningKeyID = key3.KeyID - require.NoError(t, store.UpsertAllocs( + must.NoError(t, store.UpsertAllocs( structs.MsgTypeTestSetup, 850, []*structs.Allocation{alloc})) // insert an "old" key that's inactive but being used by an alloc - key4 := structs.NewRootKeyMeta() - key4.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(900, key4, false)) + key4 := structs.NewRootKeyMeta().MakeInactive() + key4.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(900, key4, false)) // insert the dead allocation using key4 alloc2 := mock.Alloc() alloc2.ClientStatus = structs.AllocClientStatusFailed alloc2.DesiredStatus = structs.AllocDesiredStatusStop alloc2.SigningKeyID = key4.KeyID - require.NoError(t, store.UpsertAllocs( + must.NoError(t, store.UpsertAllocs( structs.MsgTypeTestSetup, 950, []*structs.Allocation{alloc2})) - // insert a time table index before the last key - tt := srv.fsm.TimeTable() - tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold)) + // insert an inactive key older than RootKeyGCThreshold but not RootKeyRotationThreshold + key5 := structs.NewRootKeyMeta().MakeInactive() + key5.CreateTime = now - (15 * time.Minute).Nanoseconds() + must.NoError(t, store.UpsertRootKeyMeta(1500, key5, false)) - // insert a "new" but inactive key - key5 := structs.NewRootKeyMeta() - key5.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(1500, key5, false)) + // prepublishing key should never be GC'd no matter how old + key6 := structs.NewRootKeyMeta().MakePrepublished(yesterday) + key6.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(1600, key6, false)) // run the core job snap, err := store.Snapshot() - require.NoError(t, err) + must.NoError(t, err) core := NewCoreScheduler(srv, snap) eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000) c := core.(*CoreScheduler) - require.NoError(t, c.rootKeyRotateOrGC(eval)) + must.NoError(t, c.rootKeyGC(eval, time.Now())) ws := memdb.NewWatchSet() key, err := store.RootKeyMetaByID(ws, key0.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "active key should not have been GCd") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("active key should not have been GCd")) key, err = store.RootKeyMetaByID(ws, key1.KeyID) - require.NoError(t, err) - require.Nil(t, key, "old and unused inactive key should have been GCd") + must.NoError(t, err) + must.Nil(t, key, must.Sprint("old and unused inactive key should have been GCd")) key, err = store.RootKeyMetaByID(ws, key2.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "old key should not have been GCd if still in use") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("old key should not have been GCd if still in use")) key, err = store.RootKeyMetaByID(ws, key3.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "old key used to sign a live alloc should not have been GCd") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("old key used to sign a live alloc should not have been GCd")) key, err = store.RootKeyMetaByID(ws, key4.KeyID) - require.NoError(t, err) - require.Nil(t, key, "old key used to sign a terminal alloc should have been GCd") + must.NoError(t, err) + must.Nil(t, key, must.Sprint("old key used to sign a terminal alloc should have been GCd")) key, err = store.RootKeyMetaByID(ws, key5.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "new key should not have been GCd") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("key newer than GC+rotation threshold should not have been GCd")) + key, err = store.RootKeyMetaByID(ws, key6.KeyID) + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("prepublishing key should not have been GCd")) } // TestCoreScheduler_VariablesRekey exercises variables rekeying func TestCoreScheduler_VariablesRekey(t *testing.T) { ci.Parallel(t) - srv, cleanup := TestServer(t, nil) + srv, cleanup := TestServer(t, func(c *Config) { + c.NumSchedulers = 1 + }) defer cleanup() testutil.WaitForKeyring(t, srv.RPC, "global") store := srv.fsm.State() key0, err := store.GetActiveRootKeyMeta(nil) - require.NotNil(t, key0, "expected keyring to be bootstapped") - require.NoError(t, err) + must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped")) + must.NoError(t, err) for i := 0; i < 3; i++ { req := &structs.VariablesApplyRequest{ @@ -2734,7 +2846,7 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) { WriteRequest: structs.WriteRequest{Region: srv.config.Region}, } resp := &structs.VariablesApplyResponse{} - require.NoError(t, srv.RPC("Variables.Apply", req, resp)) + must.NoError(t, srv.RPC("Variables.Apply", req, resp)) } rotateReq := &structs.KeyringRotateRootKeyRequest{ @@ -2743,7 +2855,7 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) { }, } var rotateResp structs.KeyringRotateRootKeyResponse - require.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) + must.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) for i := 0; i < 3; i++ { req := &structs.VariablesApplyRequest{ @@ -2752,31 +2864,29 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) { WriteRequest: structs.WriteRequest{Region: srv.config.Region}, } resp := &structs.VariablesApplyResponse{} - require.NoError(t, srv.RPC("Variables.Apply", req, resp)) + must.NoError(t, srv.RPC("Variables.Apply", req, resp)) } rotateReq.Full = true - require.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) + must.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) newKeyID := rotateResp.Key.KeyID - require.Eventually(t, func() bool { - ws := memdb.NewWatchSet() - iter, err := store.Variables(ws) - require.NoError(t, err) - for { - raw := iter.Next() - if raw == nil { - break + must.Wait(t, wait.InitialSuccess( + wait.Timeout(5*time.Second), + wait.Gap(100*time.Millisecond), + wait.BoolFunc(func() bool { + iter, _ := store.Variables(nil) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + variable := raw.(*structs.VariableEncrypted) + if variable.KeyID != newKeyID { + return false + } } - variable := raw.(*structs.VariableEncrypted) - if variable.KeyID != newKeyID { - return false - } - } - return true - }, time.Second*5, 100*time.Millisecond, - "variable rekey should be complete") + originalKey, _ := store.RootKeyMetaByID(nil, key0.KeyID) + return originalKey.IsInactive() + }), + ), must.Sprint("variable rekey should be complete")) } func TestCoreScheduler_FailLoop(t *testing.T) { diff --git a/nomad/encrypter.go b/nomad/encrypter.go index f39a2855bd7a..6b23bb9a7657 100644 --- a/nomad/encrypter.go +++ b/nomad/encrypter.go @@ -23,9 +23,7 @@ import ( "github.com/go-jose/go-jose/v3" "github.com/go-jose/go-jose/v3/jwt" "github.com/hashicorp/go-hclog" - log "github.com/hashicorp/go-hclog" kms "github.com/hashicorp/go-kms-wrapping/v2" - wrapping "github.com/hashicorp/go-kms-wrapping/v2" "github.com/hashicorp/go-kms-wrapping/v2/aead" "github.com/hashicorp/go-kms-wrapping/wrappers/awskms/v2" "github.com/hashicorp/go-kms-wrapping/wrappers/azurekeyvault/v2" @@ -554,7 +552,7 @@ func (e *Encrypter) loadKeyFromStore(path string) (*structs.RootKey, error) { if wrappedDEK == nil { // older KEK wrapper versions with AEAD-only have the key material in a // different field - wrappedDEK = &wrapping.BlobInfo{Ciphertext: kekWrapper.EncryptedDataEncryptionKey} + wrappedDEK = &kms.BlobInfo{Ciphertext: kekWrapper.EncryptedDataEncryptionKey} } key, err := wrapper.Decrypt(e.srv.shutdownCtx, wrappedDEK) if err != nil { @@ -573,7 +571,7 @@ func (e *Encrypter) loadKeyFromStore(path string) (*structs.RootKey, error) { } else if len(kekWrapper.EncryptedRSAKey) > 0 { // older KEK wrapper versions with AEAD-only have the key material in a // different field - rsaKey, err = wrapper.Decrypt(e.srv.shutdownCtx, &wrapping.BlobInfo{ + rsaKey, err = wrapper.Decrypt(e.srv.shutdownCtx, &kms.BlobInfo{ Ciphertext: kekWrapper.EncryptedRSAKey}) if err != nil { return nil, fmt.Errorf("%w (rsa key): %w", ErrDecryptFailed, err) @@ -652,7 +650,7 @@ func (e *Encrypter) newKMSWrapper(provider *structs.KEKProviderConfig, keyID str config, ok := e.providerConfigs[provider.ID()] if ok { - _, err := wrapper.SetConfig(context.Background(), wrapping.WithConfigMap(config.Config)) + _, err := wrapper.SetConfig(context.Background(), kms.WithConfigMap(config.Config)) if err != nil { return nil, err } @@ -663,7 +661,7 @@ func (e *Encrypter) newKMSWrapper(provider *structs.KEKProviderConfig, keyID str type KeyringReplicator struct { srv *Server encrypter *Encrypter - logger log.Logger + logger hclog.Logger stopFn context.CancelFunc } diff --git a/nomad/encrypter_test.go b/nomad/encrypter_test.go index e3227daf3258..092e136bd1e7 100644 --- a/nomad/encrypter_test.go +++ b/nomad/encrypter_test.go @@ -568,7 +568,7 @@ func TestEncrypter_Upgrade17(t *testing.T) { oldRootKey, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) must.NoError(t, err) - oldRootKey.Meta.SetActive() + oldRootKey = oldRootKey.MakeActive() // Remove RSAKey to mimic 1.6 oldRootKey.RSAKey = nil diff --git a/nomad/keyring_endpoint.go b/nomad/keyring_endpoint.go index 95302b639164..9a383b1c5cb9 100644 --- a/nomad/keyring_endpoint.go +++ b/nomad/keyring_endpoint.go @@ -50,13 +50,20 @@ func (k *Keyring) Rotate(args *structs.KeyringRotateRootKeyRequest, reply *struc if args.Algorithm == "" { args.Algorithm = structs.EncryptionAlgorithmAES256GCM } + if args.Full && args.PublishTime > 0 { + return fmt.Errorf("keyring cannot be prepublished and full rotated at the same time") + } rootKey, err := structs.NewRootKey(args.Algorithm) if err != nil { return err } - rootKey.Meta.SetActive() + if args.PublishTime != 0 { + rootKey.Meta = rootKey.Meta.MakePrepublished(args.PublishTime) + } else { + rootKey.Meta = rootKey.Meta.MakeActive() + } // make sure it's been added to the local keystore before we write // it to raft, so that followers don't try to Get a key that @@ -329,7 +336,7 @@ func (k *Keyring) Delete(args *structs.KeyringDeleteRootKeyRequest, reply *struc if keyMeta == nil { return nil // safe to bail out early } - if keyMeta.Active() { + if keyMeta.IsActive() { return fmt.Errorf("active root key cannot be deleted - call rotate first") } diff --git a/nomad/keyring_endpoint_test.go b/nomad/keyring_endpoint_test.go index e0573fad840b..7aa999e97f96 100644 --- a/nomad/keyring_endpoint_test.go +++ b/nomad/keyring_endpoint_test.go @@ -32,7 +32,7 @@ func TestKeyringEndpoint_CRUD(t *testing.T) { key, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) require.NoError(t, err) id := key.Meta.KeyID - key.Meta.SetActive() + key = key.MakeActive() updateReq := &structs.KeyringUpdateRootKeyRequest{ RootKey: key, @@ -106,7 +106,7 @@ func TestKeyringEndpoint_CRUD(t *testing.T) { require.EqualError(t, err, "active root key cannot be deleted - call rotate first") // set inactive - updateReq.RootKey.Meta.SetInactive() + updateReq.RootKey.Meta = updateReq.RootKey.Meta.MakeInactive() err = msgpackrpc.CallWithCodec(codec, "Keyring.Update", updateReq, &updateResp) require.NoError(t, err) @@ -142,7 +142,7 @@ func TestKeyringEndpoint_InvalidUpdates(t *testing.T) { key, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) require.NoError(t, err) id := key.Meta.KeyID - key.Meta.SetActive() + key = key.MakeActive() updateReq := &structs.KeyringUpdateRootKeyRequest{ RootKey: key, @@ -228,10 +228,14 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { testutil.WaitForKeyring(t, srv.RPC, "global") codec := rpcClient(t, srv) + store := srv.fsm.State() + key0, err := store.GetActiveRootKeyMeta(nil) + must.NoError(t, err) + // Setup an existing key key, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) - require.NoError(t, err) - key.Meta.SetActive() + must.NoError(t, err) + key1 := key.Meta updateReq := &structs.KeyringUpdateRootKeyRequest{ RootKey: key, @@ -242,7 +246,7 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { } var updateResp structs.KeyringUpdateRootKeyResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.Update", updateReq, &updateResp) - require.NoError(t, err) + must.NoError(t, err) // Rotate the key @@ -253,14 +257,13 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { } var rotateResp structs.KeyringRotateRootKeyResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) - require.EqualError(t, err, structs.ErrPermissionDenied.Error()) + must.EqError(t, err, structs.ErrPermissionDenied.Error()) rotateReq.AuthToken = rootToken.SecretID err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) - require.NoError(t, err) - require.NotEqual(t, updateResp.Index, rotateResp.Index) - - newID := rotateResp.Key.KeyID + must.NoError(t, err) + must.Greater(t, updateResp.Index, rotateResp.Index) + key2 := rotateResp.Key // Verify we have a new key and the old one is inactive @@ -272,31 +275,62 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { } var listResp structs.KeyringListRootKeyMetaResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.List", listReq, &listResp) - require.NoError(t, err) - - require.Greater(t, listResp.Index, updateResp.Index) - require.Len(t, listResp.Keys, 3) // bootstrap + old + new + must.NoError(t, err) + must.Greater(t, updateResp.Index, listResp.Index) + must.Len(t, 3, listResp.Keys) // bootstrap + old + new for _, keyMeta := range listResp.Keys { - if keyMeta.KeyID != newID { - require.False(t, keyMeta.Active(), "expected old keys to be inactive") - } else { - require.True(t, keyMeta.Active(), "expected new key to be inactive") + switch keyMeta.KeyID { + case key0.KeyID, key1.KeyID: + must.True(t, keyMeta.IsInactive(), must.Sprint("older keys must be inactive")) + case key2.KeyID: + must.True(t, keyMeta.IsActive(), must.Sprint("expected new key to be active")) } } getReq := &structs.KeyringGetRootKeyRequest{ - KeyID: newID, + KeyID: key2.KeyID, QueryOptions: structs.QueryOptions{ Region: "global", }, } var getResp structs.KeyringGetRootKeyResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.Get", getReq, &getResp) - require.NoError(t, err) + must.NoError(t, err) + must.Len(t, 32, getResp.Key.Key) + + // Rotate the key with prepublishing + + publishTime := time.Now().Add(24 * time.Hour).UnixNano() + rotateResp = structs.KeyringRotateRootKeyResponse{} + rotateReq = &structs.KeyringRotateRootKeyRequest{ + PublishTime: publishTime, + WriteRequest: structs.WriteRequest{ + Region: "global", + AuthToken: rootToken.SecretID, + }, + } + err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) + must.NoError(t, err) + must.Greater(t, updateResp.Index, rotateResp.Index) + key3 := rotateResp.Key - gotKey := getResp.Key - require.Len(t, gotKey.Key, 32) + listResp = structs.KeyringListRootKeyMetaResponse{} + err = msgpackrpc.CallWithCodec(codec, "Keyring.List", listReq, &listResp) + must.NoError(t, err) + must.Greater(t, updateResp.Index, listResp.Index) + must.Len(t, 4, listResp.Keys) // bootstrap + old + new + prepublished + + for _, keyMeta := range listResp.Keys { + switch keyMeta.KeyID { + case key0.KeyID, key1.KeyID: + must.True(t, keyMeta.IsInactive(), must.Sprint("older keys must be inactive")) + case key2.KeyID: + must.True(t, keyMeta.IsActive(), must.Sprint("expected active key to remain active")) + case key3.KeyID: + must.True(t, keyMeta.IsPrepublished(), must.Sprint("expected new key to be prepublished")) + } + } } // TestKeyringEndpoint_ListPublic asserts the Keyring.ListPublic RPC returns diff --git a/nomad/leader.go b/nomad/leader.go index fc5a0b09d404..d771a3cd1607 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -2760,7 +2760,7 @@ func (s *Server) initializeKeyring(stopCh <-chan struct{}) { logger.Trace("initializing keyring") rootKey, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) - rootKey.Meta.SetActive() + rootKey = rootKey.MakeActive() if err != nil { logger.Error("could not initialize keyring: %v", err) return diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index acd4c782c280..6fce8574da48 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -7340,10 +7340,10 @@ func (s *StateStore) UpsertRootKeyMeta(index uint64, rootKeyMeta *structs.RootKe existing := raw.(*structs.RootKeyMeta) rootKeyMeta.CreateIndex = existing.CreateIndex rootKeyMeta.CreateTime = existing.CreateTime - isRotation = !existing.Active() && rootKeyMeta.Active() + isRotation = !existing.IsActive() && rootKeyMeta.IsActive() } else { rootKeyMeta.CreateIndex = index - isRotation = rootKeyMeta.Active() + isRotation = rootKeyMeta.IsActive() } rootKeyMeta.ModifyIndex = index @@ -7369,14 +7369,14 @@ func (s *StateStore) UpsertRootKeyMeta(index uint64, rootKeyMeta *structs.RootKe switch key.State { case structs.RootKeyStateInactive: if rekey { - key.SetRekeying() + key = key.MakeRekeying() modified = true } case structs.RootKeyStateActive: if rekey { - key.SetRekeying() + key = key.MakeRekeying() } else { - key.SetInactive() + key = key.MakeInactive() } modified = true case structs.RootKeyStateRekeying, structs.RootKeyStateDeprecated: @@ -7475,7 +7475,7 @@ func (s *StateStore) GetActiveRootKeyMeta(ws memdb.WatchSet) (*structs.RootKeyMe break } key := raw.(*structs.RootKeyMeta) - if key.Active() { + if key.IsActive() { return key, nil } } diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 04e0a0d4c1f7..7e4243296f27 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -10776,7 +10776,7 @@ func TestStateStore_RootKeyMetaData_CRUD(t *testing.T) { key := structs.NewRootKeyMeta() keyIDs = append(keyIDs, key.KeyID) if i == 0 { - key.SetActive() + key = key.MakeActive() } index++ require.NoError(t, store.UpsertRootKeyMeta(index, key, false)) @@ -10792,8 +10792,7 @@ func TestStateStore_RootKeyMetaData_CRUD(t *testing.T) { require.NoError(t, err) require.NotNil(t, inactiveKey) oldCreateIndex := inactiveKey.CreateIndex - newlyActiveKey := inactiveKey.Copy() - newlyActiveKey.SetActive() + newlyActiveKey := inactiveKey.MakeActive() index++ require.NoError(t, store.UpsertRootKeyMeta(index, newlyActiveKey, false)) @@ -10806,10 +10805,10 @@ func TestStateStore_RootKeyMetaData_CRUD(t *testing.T) { } key := raw.(*structs.RootKeyMeta) if key.KeyID == newlyActiveKey.KeyID { - require.True(t, key.Active(), "expected updated key to be active") + require.True(t, key.IsActive(), "expected updated key to be active") require.Equal(t, oldCreateIndex, key.CreateIndex) } else { - require.False(t, key.Active(), "expected other keys to be inactive") + require.False(t, key.IsActive(), "expected other keys to be inactive") } } @@ -10827,7 +10826,7 @@ func TestStateStore_RootKeyMetaData_CRUD(t *testing.T) { } key := raw.(*structs.RootKeyMeta) require.NotEqual(t, keyIDs[1], key.KeyID) - require.False(t, key.Active(), "expected remaining keys to be inactive") + require.False(t, key.IsActive(), "expected remaining keys to be inactive") found++ } require.Equal(t, 2, found, "expected only 2 keys remaining") diff --git a/nomad/structs/keyring.go b/nomad/structs/keyring.go index 67379d89a9b4..64a47798d409 100644 --- a/nomad/structs/keyring.go +++ b/nomad/structs/keyring.go @@ -11,6 +11,7 @@ import ( "fmt" "maps" "net/url" + "slices" "time" "github.com/go-jose/go-jose/v3" @@ -77,6 +78,33 @@ func NewRootKey(algorithm EncryptionAlgorithm) (*RootKey, error) { return rootKey, nil } +func (k *RootKey) Copy() *RootKey { + return &RootKey{ + Meta: k.Meta.Copy(), + Key: slices.Clone(k.Key), + RSAKey: slices.Clone(k.RSAKey), + } +} + +// MakeInactive returns a copy of the RootKey with the meta state set to active +func (k *RootKey) MakeActive() *RootKey { + return &RootKey{ + Meta: k.Meta.MakeActive(), + Key: slices.Clone(k.Key), + RSAKey: slices.Clone(k.RSAKey), + } +} + +// MakeInactive returns a copy of the RootKey with the meta state set to +// inactive +func (k *RootKey) MakeInactive() *RootKey { + return &RootKey{ + Meta: k.Meta.MakeInactive(), + Key: slices.Clone(k.Key), + RSAKey: slices.Clone(k.RSAKey), + } +} + // RootKeyMeta is the metadata used to refer to a RootKey. It is // stored in raft. type RootKeyMeta struct { @@ -86,6 +114,7 @@ type RootKeyMeta struct { CreateIndex uint64 ModifyIndex uint64 State RootKeyState + PublishTime int64 } // KEKProviderName enum are the built-in KEK providers. @@ -145,9 +174,10 @@ func (c *KEKProviderConfig) ID() string { type RootKeyState string const ( - RootKeyStateInactive RootKeyState = "inactive" - RootKeyStateActive = "active" - RootKeyStateRekeying = "rekeying" + RootKeyStateInactive RootKeyState = "inactive" + RootKeyStateActive = "active" + RootKeyStateRekeying = "rekeying" + RootKeyStatePrepublished = "prepublished" // RootKeyStateDeprecated is, itself, deprecated and is no longer in // use. For backwards compatibility, any existing keys with this state will @@ -177,33 +207,66 @@ type RootKeyMetaStub struct { State RootKeyState } -// Active indicates his key is the one currently being used for -// crypto operations (at most one key can be Active) -func (rkm *RootKeyMeta) Active() bool { +// IsActive indicates this key is the one currently being used for crypto +// operations (at most one key can be Active) +func (rkm *RootKeyMeta) IsActive() bool { return rkm.State == RootKeyStateActive } -func (rkm *RootKeyMeta) SetActive() { - rkm.State = RootKeyStateActive +// MakeActive returns a copy of the RootKeyMeta with the state set to active +func (rkm *RootKeyMeta) MakeActive() *RootKeyMeta { + out := rkm.Copy() + if out != nil { + out.State = RootKeyStateActive + out.PublishTime = 0 + } + return out } -// Rekeying indicates that variables encrypted with this key should be +// IsRekeying indicates that variables encrypted with this key should be // rekeyed -func (rkm *RootKeyMeta) Rekeying() bool { +func (rkm *RootKeyMeta) IsRekeying() bool { return rkm.State == RootKeyStateRekeying } -func (rkm *RootKeyMeta) SetRekeying() { - rkm.State = RootKeyStateRekeying +// MakeRekeying returns a copy of the RootKeyMeta with the state set to rekeying +func (rkm *RootKeyMeta) MakeRekeying() *RootKeyMeta { + out := rkm.Copy() + if out != nil { + out.State = RootKeyStateRekeying + } + return out +} + +// MakePrepublished returns a copy of the RootKeyMeta with the state set to +// prepublished at the time t +func (rkm *RootKeyMeta) MakePrepublished(t int64) *RootKeyMeta { + out := rkm.Copy() + if out != nil { + out.PublishTime = t + out.State = RootKeyStatePrepublished + } + return out } -func (rkm *RootKeyMeta) SetInactive() { - rkm.State = RootKeyStateInactive +// IsPrepublished indicates that this key has been published and is pending +// being promoted to active +func (rkm *RootKeyMeta) IsPrepublished() bool { + return rkm.State == RootKeyStatePrepublished } -// Inactive indicates that this key is no longer being used to encrypt new +// MakeInactive returns a copy of the RootKeyMeta with the state set to inactive +func (rkm *RootKeyMeta) MakeInactive() *RootKeyMeta { + out := rkm.Copy() + if out != nil { + out.State = RootKeyStateInactive + } + return out +} + +// IsInactive indicates that this key is no longer being used to encrypt new // variables or workload identities. -func (rkm *RootKeyMeta) Inactive() bool { +func (rkm *RootKeyMeta) IsInactive() bool { return rkm.State == RootKeyStateInactive || rkm.State == RootKeyStateDeprecated } @@ -239,7 +302,7 @@ func (rkm *RootKeyMeta) Validate() error { } switch rkm.State { case RootKeyStateInactive, RootKeyStateActive, - RootKeyStateRekeying, RootKeyStateDeprecated: + RootKeyStateRekeying, RootKeyStateDeprecated, RootKeyStatePrepublished: default: return fmt.Errorf("root key state %q is invalid", rkm.State) } @@ -277,8 +340,9 @@ const ( // KeyringRotateRootKeyRequest is the argument to the Keyring.Rotate RPC type KeyringRotateRootKeyRequest struct { - Algorithm EncryptionAlgorithm - Full bool + Algorithm EncryptionAlgorithm + Full bool + PublishTime int64 WriteRequest } diff --git a/website/content/docs/commands/operator/root/keyring-rotate.mdx b/website/content/docs/commands/operator/root/keyring-rotate.mdx index 2ba424ef2418..aef49c6265aa 100644 --- a/website/content/docs/commands/operator/root/keyring-rotate.mdx +++ b/website/content/docs/commands/operator/root/keyring-rotate.mdx @@ -25,19 +25,34 @@ nomad operator root keyring rotate [options] ## Rotate Options - `-full`: Decrypt all existing variables and re-encrypt with the new key. This - command will immediately return and the re-encryption process will run - asynchronously on the leader. + command will immediately return and the re-encryption process will run + asynchronously on the leader. + +- `-now`: Publish the new key immediately without prepublishing. One of `-now` + or `-prepublish` must be set. + +- `-prepublish`: Set a duration for which to prepublish the new key + (ex. "1h"). The currently active key will be unchanged but the new public key + will be available in the JWKS endpoint. Multiple keys can be prepublished and + they will be promoted to active in order of publish time, at most once every + [`root_key_gc_interval`][]. One of `-now` or `-prepublish` must be set. - `-verbose`: Enable verbose output ## Examples ```shell-session -$ nomad operator root keyring rotate -Key State Create Time -f19f6029 active 2022-07-11T19:14:36Z +$ nomad operator root keyring rotate -now +Key State Create Time Publish Time +f19f6029 active 2022-07-11T19:14:36Z -$ nomad operator root keyring rotate -verbose -Key State Create Time -53186ac1-9002-c4b6-216d-bb19fd37a791 active 2022-07-11T19:14:47Z +$ nomad operator root keyring rotate -now -verbose +Key State Create Time Publish Time +53186ac1-9002-c4b6-216d-bb19fd37a791 active 2022-07-11T19:14:47Z + +$ nomad operator root keyring rotate -prepublish 1h +Key State Create Time Publish Time +7f15e4e9 active 2022-07-11T19:15:10Z 2022-07-11T20:15:10Z ``` + +[`root_key_gc_interval`]: /nomad/docs/configuration/server#root_key_gc_interval diff --git a/website/content/docs/configuration/server.mdx b/website/content/docs/configuration/server.mdx index 3bde93207724..a1a4fa113d9b 100644 --- a/website/content/docs/configuration/server.mdx +++ b/website/content/docs/configuration/server.mdx @@ -236,13 +236,16 @@ server { - `root_key_gc_interval` `(string: "10m")` - Specifies the interval between [encryption key][] metadata garbage collections. -- `root_key_gc_threshold` `(string: "1h")` - Specifies the minimum time that an - [encryption key][] must exist before it can be eligible for garbage - collection. +- `root_key_gc_threshold` `(string: "1h")` - Specifies the minimum time after + the `root_key_rotation_threshold` has passed that an [encryption key][] must + exist before it can be eligible for garbage collection. -- `root_key_rotation_threshold` `(string: "720h")` - Specifies the minimum time - that an [encryption key][] must exist before it is automatically rotated on - the next garbage collection interval. +- `root_key_rotation_threshold` `(string: "720h")` - Specifies the lifetime of + an active [encryption key][] before it is automatically rotated on the next + garbage collection interval. Nomad will prepublish the replacement key at half + the `root_key_rotation_threshold` time so external consumers of Workload + Identity have time to obtain the new public key from the [JWKS URL][] before + it is used. - `server_join` ([server_join][server-join]: nil) - Specifies how the Nomad server will connect to other Nomad servers. The `retry_join` @@ -517,3 +520,4 @@ work. [wi]: /nomad/docs/concepts/workload-identity [Configure for multiple regions]: /nomad/tutorials/access-control/access-control-bootstrap#configure-for-multiple-regions [top_level_data_dir]: /nomad/docs/configuration#data_dir +[JWKS URL]: /nomad/api-docs/operator/keyring#list-active-public-keys diff --git a/website/content/docs/integrations/consul/acl.mdx b/website/content/docs/integrations/consul/acl.mdx index 9499fe7fe2a7..d5bc5d4b3587 100644 --- a/website/content/docs/integrations/consul/acl.mdx +++ b/website/content/docs/integrations/consul/acl.mdx @@ -349,10 +349,10 @@ You may also host the JWKS JSON response from Nomad in an external location that is reachable by the Consul servers, and use that address as the value for `JWKSURL`. -It is important to remember that the Nomad keys **are rotated periodically**, -so both approaches should be automated and done continually. The rotation -frequency is controlled by the [`server.root_key_rotation_threshold`][] -configuration of the Nomad servers. +It is important to remember that the Nomad keys **are rotated periodically**, so +both approaches should be automated and done continually. The rotation frequency +is controlled by the [`server.root_key_rotation_threshold`][] configuration of +the Nomad servers. Keys will be prepublished at half the rotation threshold. ### Additional References diff --git a/website/content/docs/integrations/vault/acl.mdx b/website/content/docs/integrations/vault/acl.mdx index f6fe3d09929d..13ea25860118 100644 --- a/website/content/docs/integrations/vault/acl.mdx +++ b/website/content/docs/integrations/vault/acl.mdx @@ -316,10 +316,10 @@ You may also host the JWKS JSON response from Nomad in an external location that is reachable by the Vault servers, and use that address as the value for `jwks_url`. -It is important to remember that the Nomad keys **are rotated periodically**, -so both approaches should be automated and done continually. The rotation -frequency is controlled by the [`server.root_key_rotation_threshold`][] -configuration of the Nomad servers. +It is important to remember that the Nomad keys **are rotated periodically**, so +both approaches should be automated and done continually. The rotation frequency +is controlled by the [`server.root_key_rotation_threshold`][] configuration of +the Nomad servers. Keys will be prepublished at half the rotation threshold. ### Additional References diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index 181e624e2dce..4e9467d2f92b 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -13,7 +13,19 @@ upgrade. However, specific versions of Nomad may have more details provided for their upgrades as a result of new features or changed behavior. This page is used to document those details separately from the standard upgrade flow. -## Nomad 1.8.2 (UNRELEASED) +## Nomad 1.8.3 (UNRELEASED) + +#### Nomad keyring rotation + +In Nomad 1.8.3, the Nomad root keyring will prepublish keys at half the +`root_key_rotation_threshold` and promote them to active once the +`root_key_rotation_threshold` has passed. The `nomad operator root keyring +rotate` command now requires one of two arguments: `-prepublish ` to +prepublish a key or `-now` to rotate immediately. We recommend using +`-prepublish` to avoid outages from workload identities used to log into +external services such as Vault or Consul. + +## Nomad 1.8.2 #### New `windows_allow_insecure_container_admin` configuration option for Docker driver