From 6d054c401d4c5269220855f5b724ae29e665a4ed Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 12 Jul 2024 11:08:37 -0400 Subject: [PATCH] keyring: support prepublishing keys When a root key is rotated, the servers immediately start signing Workload Identities with the new active key. But workloads may be using those WI tokens to sign into external services, which may not have had time to fetch the new public key and which might try to fetch new keys as needed. Add support for prepublishing keys. Prepublished keys will be visible in the JWKS endpoint but will not be used for signing or encryption until their `PublishTime`. Update the periodic key rotation to prepublish keys at half the `root_key_rotation_threshold` window, and promote prepublished keys to active after the `PublishTime`. This changeset also fixes two bugs in periodic root key rotation and garbage collection, both of which can't be safely fixed without implementing prepublishing: * Periodic root key rotation would never happen because the default `root_key_rotation_threshold` of 720h exceeds the 72h maximum window of the FSM time table. We now compare the `CreateTime` against the wall clock time instead of the time table. (We expect to remove the time table in future work, ref https://github.com/hashicorp/nomad/issues/16359) * Root key garbage collection could GC keys that were used to sign identities. We now wait until `root_key_rotation_threshold` + `root_key_gc_threshold` before GC'ing a key. Ref: https://hashicorp.atlassian.net/browse/NET-10398 Ref: https://hashicorp.atlassian.net/browse/NET-10280 Fixes: https://github.com/hashicorp/nomad/issues/19669 Fixes: https://github.com/hashicorp/nomad/issues/23528 --- .changelog/23577.txt | 11 ++ api/keyring.go | 18 +- command/agent/keyring_endpoint.go | 10 ++ command/agent/keyring_endpoint_test.go | 82 ++++++--- command/operator_root_keyring.go | 10 +- command/operator_root_keyring_rotate.go | 24 ++- nomad/core_sched.go | 137 +++++++++++--- nomad/core_sched_test.go | 228 ++++++++++++++++++------ nomad/encrypter.go | 17 +- nomad/keyring_endpoint.go | 16 +- nomad/keyring_endpoint_test.go | 74 +++++--- nomad/structs/keyring.go | 38 +++- 12 files changed, 506 insertions(+), 159 deletions(-) create mode 100644 .changelog/23577.txt diff --git a/.changelog/23577.txt b/.changelog/23577.txt new file mode 100644 index 000000000000..544035e282d2 --- /dev/null +++ b/.changelog/23577.txt @@ -0,0 +1,11 @@ +```release-note:improvement +keyring: Added support for prepublishing keys +``` + +```release-note:bug +keyring: Fixed a bug where periodic key rotation would not occur +``` + +```release-note:bug +keyring: Fixed a bug where keys could be garbage collected before workload identities expire +``` diff --git a/api/keyring.go b/api/keyring.go index d87d8b720fcc..69e0c9c10731 100644 --- a/api/keyring.go +++ b/api/keyring.go @@ -34,16 +34,18 @@ type RootKeyMeta struct { CreateIndex uint64 ModifyIndex uint64 State RootKeyState + PublishTime int64 } // RootKeyState enum describes the lifecycle of a root key. type RootKeyState string const ( - RootKeyStateInactive RootKeyState = "inactive" - RootKeyStateActive = "active" - RootKeyStateRekeying = "rekeying" - RootKeyStateDeprecated = "deprecated" + RootKeyStateInactive RootKeyState = "inactive" + RootKeyStateActive = "active" + RootKeyStateRekeying = "rekeying" + RootKeyStateDeprecated = "deprecated" + RootKeyStatePrepublished = "prepublished" ) // List lists all the keyring metadata @@ -78,6 +80,9 @@ func (k *Keyring) Rotate(opts *KeyringRotateOptions, w *WriteOptions) (*RootKeyM if opts.Full { qp.Set("full", "true") } + if opts.PublishTime > 0 { + qp.Set("publish_time", fmt.Sprintf("%d", opts.PublishTime)) + } } resp := &struct{ Key *RootKeyMeta }{} wm, err := k.client.put("/v1/operator/keyring/rotate?"+qp.Encode(), nil, resp, w) @@ -86,6 +91,7 @@ func (k *Keyring) Rotate(opts *KeyringRotateOptions, w *WriteOptions) (*RootKeyM // KeyringRotateOptions are parameters for the Rotate API type KeyringRotateOptions struct { - Full bool - Algorithm EncryptionAlgorithm + Full bool + Algorithm EncryptionAlgorithm + PublishTime int64 } diff --git a/command/agent/keyring_endpoint.go b/command/agent/keyring_endpoint.go index b753507323cc..870aae3f06c4 100644 --- a/command/agent/keyring_endpoint.go +++ b/command/agent/keyring_endpoint.go @@ -6,6 +6,7 @@ package agent import ( "fmt" "net/http" + "strconv" "strings" "time" @@ -167,6 +168,15 @@ func (s *HTTPServer) keyringRotateRequest(resp http.ResponseWriter, req *http.Re args.Full = true } + ptRaw := query.Get("publish_time") + if ptRaw != "" { + publishTime, err := strconv.ParseInt(ptRaw, 10, 64) + if err != nil { + return nil, fmt.Errorf("invalid publish_time: %w", err) + } + args.PublishTime = publishTime + } + var out structs.KeyringRotateRootKeyResponse if err := s.agent.RPC("Keyring.Rotate", &args, &out); err != nil { return nil, err diff --git a/command/agent/keyring_endpoint_test.go b/command/agent/keyring_endpoint_test.go index 938642795af7..a17028540ff1 100644 --- a/command/agent/keyring_endpoint_test.go +++ b/command/agent/keyring_endpoint_test.go @@ -4,6 +4,7 @@ package agent import ( + "fmt" "net/http" "net/http/httptest" "strconv" @@ -13,7 +14,6 @@ import ( "github.com/go-jose/go-jose/v3" "github.com/shoenig/test/must" - "github.com/stretchr/testify/require" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/nomad/structs" @@ -29,57 +29,83 @@ func TestHTTP_Keyring_CRUD(t *testing.T) { // List (get bootstrap key) req, err := http.NewRequest(http.MethodGet, "/v1/operator/keyring/keys", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err := s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) listResp := obj.([]*structs.RootKeyMeta) - require.Len(t, listResp, 1) - oldKeyID := listResp[0].KeyID + must.Len(t, 1, listResp) + key0 := listResp[0].KeyID // Rotate req, err = http.NewRequest(http.MethodPut, "/v1/operator/keyring/rotate", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) - require.NotZero(t, respW.HeaderMap.Get("X-Nomad-Index")) + must.NoError(t, err) + must.NotEq(t, "", respW.HeaderMap.Get("X-Nomad-Index")) rotateResp := obj.(structs.KeyringRotateRootKeyResponse) - require.NotNil(t, rotateResp.Key) - require.True(t, rotateResp.Key.Active()) - newID1 := rotateResp.Key.KeyID + must.NotNil(t, rotateResp.Key) + must.True(t, rotateResp.Key.Active()) + key1 := rotateResp.Key.KeyID + + // Rotate with prepublish + + publishTime := time.Now().Add(24 * time.Hour).UnixNano() + req, err = http.NewRequest(http.MethodPut, + fmt.Sprintf("/v1/operator/keyring/rotate?publish_time=%d", publishTime), nil) + must.NoError(t, err) + obj, err = s.Server.KeyringRequest(respW, req) + must.NoError(t, err) + must.NotEq(t, "", respW.HeaderMap.Get("X-Nomad-Index")) + rotateResp = obj.(structs.KeyringRotateRootKeyResponse) + must.NotNil(t, rotateResp.Key) + must.True(t, rotateResp.Key.Prepublished()) + key2 := rotateResp.Key.KeyID // List req, err = http.NewRequest(http.MethodGet, "/v1/operator/keyring/keys", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) listResp = obj.([]*structs.RootKeyMeta) - require.Len(t, listResp, 2) + must.Len(t, 3, listResp) for _, key := range listResp { - if key.KeyID == newID1 { - require.True(t, key.Active(), "new key should be active") - } else { - require.False(t, key.Active(), "initial key should be inactive") + switch key.KeyID { + case key0: + must.True(t, key.Inactive(), must.Sprint("initial key should be inactive")) + case key1: + must.True(t, key.Active(), must.Sprint("new key should be active")) + case key2: + must.True(t, key.Prepublished(), + must.Sprint("prepublished key should not be active")) } } - // Delete the old key and verify its gone + // Delete the original key and verify its gone - req, err = http.NewRequest(http.MethodDelete, "/v1/operator/keyring/key/"+oldKeyID, nil) - require.NoError(t, err) + req, err = http.NewRequest(http.MethodDelete, "/v1/operator/keyring/key/"+key0, nil) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) req, err = http.NewRequest(http.MethodGet, "/v1/operator/keyring/keys", nil) - require.NoError(t, err) + must.NoError(t, err) obj, err = s.Server.KeyringRequest(respW, req) - require.NoError(t, err) + must.NoError(t, err) listResp = obj.([]*structs.RootKeyMeta) - require.Len(t, listResp, 1) - require.Equal(t, newID1, listResp[0].KeyID) - require.True(t, listResp[0].Active()) - require.Len(t, listResp, 1) + must.Len(t, 2, listResp) + for _, key := range listResp { + switch key.KeyID { + case key0: + t.Fatalf("initial key should have been deleted") + case key1: + must.True(t, key.Active(), must.Sprint("new key should be active")) + case key2: + must.True(t, key.Prepublished(), + must.Sprint("prepublished key should not be active")) + } + } }) } diff --git a/command/operator_root_keyring.go b/command/operator_root_keyring.go index 63a3a14e8dc9..b92e1a577336 100644 --- a/command/operator_root_keyring.go +++ b/command/operator_root_keyring.go @@ -75,11 +75,15 @@ func renderVariablesKeysResponse(keys []*api.RootKeyMeta, verbose bool) string { length = 8 } out := make([]string, len(keys)+1) - out[0] = "Key|State|Create Time" + out[0] = "Key|State|Create Time|Publish Time" i := 1 for _, k := range keys { - out[i] = fmt.Sprintf("%s|%v|%s", - k.KeyID[:length], k.State, formatUnixNanoTime(k.CreateTime)) + publishTime := "" + if k.PublishTime > 0 { + publishTime = formatUnixNanoTime(k.PublishTime) + } + out[i] = fmt.Sprintf("%s|%v|%s|%s", + k.KeyID[:length], k.State, formatUnixNanoTime(k.CreateTime), publishTime) i = i + 1 } return formatList(out) diff --git a/command/operator_root_keyring_rotate.go b/command/operator_root_keyring_rotate.go index 9c274c772920..fd89deeb5d46 100644 --- a/command/operator_root_keyring_rotate.go +++ b/command/operator_root_keyring_rotate.go @@ -6,6 +6,7 @@ package command import ( "fmt" "strings" + "time" "github.com/hashicorp/nomad/api" "github.com/posener/complete" @@ -36,6 +37,12 @@ Keyring Options: will immediately return and the re-encryption process will run asynchronously on the leader. + -prepublish + Set a duration for which to prepublish the new key (ex. "1h"). The currently + active key will be unchanged but the new public key will be available in the + JWKS endpoint. Multiple keys can be prepublished and they will be promoted to + active in order of publish time, at most once every root_key_gc_interval. + -verbose Show full information. ` @@ -50,8 +57,9 @@ func (c *OperatorRootKeyringRotateCommand) Synopsis() string { func (c *OperatorRootKeyringRotateCommand) AutocompleteFlags() complete.Flags { return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), complete.Flags{ - "-full": complete.PredictNothing, - "-verbose": complete.PredictNothing, + "-full": complete.PredictNothing, + "-prepublish": complete.PredictNothing, + "-verbose": complete.PredictNothing, }) } @@ -65,11 +73,13 @@ func (c *OperatorRootKeyringRotateCommand) Name() string { func (c *OperatorRootKeyringRotateCommand) Run(args []string) int { var rotateFull, verbose bool + var prepublishDuration time.Duration flags := c.Meta.FlagSet("root keyring rotate", FlagSetClient) flags.Usage = func() { c.Ui.Output(c.Help()) } flags.BoolVar(&rotateFull, "full", false, "full key rotation") flags.BoolVar(&verbose, "verbose", false, "") + flags.DurationVar(&prepublishDuration, "prepublish", 0, "prepublish key") if err := flags.Parse(args); err != nil { return 1 @@ -88,8 +98,16 @@ func (c *OperatorRootKeyringRotateCommand) Run(args []string) int { return 1 } + publishTime := int64(0) + if prepublishDuration > 0 { + publishTime = time.Now().UnixNano() + prepublishDuration.Nanoseconds() + } + resp, _, err := client.Keyring().Rotate( - &api.KeyringRotateOptions{Full: rotateFull}, nil) + &api.KeyringRotateOptions{ + Full: rotateFull, + PublishTime: publishTime, + }, nil) if err != nil { c.Ui.Error(fmt.Sprintf("error: %s", err)) return 1 diff --git a/nomad/core_sched.go b/nomad/core_sched.go index ebbeecb234c0..c4260ca9bc35 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -99,7 +99,7 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error { if err := c.expiredACLTokenGC(eval, true); err != nil { return err } - if err := c.rootKeyGC(eval); err != nil { + if err := c.rootKeyGC(eval, time.Now()); err != nil { return err } // Node GC must occur after the others to ensure the allocations are @@ -902,20 +902,17 @@ func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error { // a rotation will be sent to the leader so our view of state // is no longer valid. we ack this core job and will pick up // the GC work on the next interval - wasRotated, err := c.rootKeyRotate(eval) + wasRotated, err := c.rootKeyRotate(eval, time.Now()) if err != nil { return err } if wasRotated { return nil } - return c.rootKeyGC(eval) + return c.rootKeyGC(eval, time.Now()) } -func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { - - oldThreshold := c.getThreshold(eval, "root key", - "root_key_gc_threshold", c.srv.config.RootKeyGCThreshold) +func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation, now time.Time) error { ws := memdb.NewWatchSet() iter, err := c.snap.RootKeyMetas(ws) @@ -923,19 +920,31 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { return err } + // the threshold is longer than we can support with the time table, and we + // never want to force-GC keys because that will orphan signed Workload + // Identities + rotationThreshold := now.Add(-1 * + (c.srv.config.RootKeyRotationThreshold + c.srv.config.RootKeyGCThreshold)) + for { raw := iter.Next() if raw == nil { break } keyMeta := raw.(*structs.RootKeyMeta) - if keyMeta.Active() || keyMeta.Rekeying() { - continue // never GC the active key or one we're rekeying + if !keyMeta.Inactive() { + continue // never GC keys we're still using } - if keyMeta.CreateIndex > oldThreshold { - continue // don't GC recent keys + + c.logger.Trace("checking inactive key eligibility for gc", + "create_time", keyMeta.CreateTime, "threshold", rotationThreshold.UnixNano()) + + if keyMeta.CreateTime > rotationThreshold.UnixNano() { + continue // don't GC keys with potentially live Workload Identities } + // don't GC keys used to encrypt Variables or sign legacy non-expiring + // Workload Identities inUse, err := c.snap.IsRootKeyMetaInUse(keyMeta.KeyID) if err != nil { return err @@ -961,26 +970,96 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error { return nil } -// rootKeyRotate checks if the active key is old enough that we need -// to kick off a rotation. -func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) (bool, error) { - - rotationThreshold := c.getThreshold(eval, "root key", - "root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold) +// rootKeyRotate checks if the active key is old enough that we need to kick off +// a rotation. It prepublishes a key first and only promotes that prepublished +// key to active once the rotation threshold has expired +func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation, now time.Time) (bool, error) { ws := memdb.NewWatchSet() - activeKey, err := c.snap.GetActiveRootKeyMeta(ws) + iter, err := c.snap.RootKeyMetas(ws) if err != nil { return false, err } + + var ( + activeKey *structs.RootKeyMeta + prepublishedKey *structs.RootKeyMeta + ) + + for raw := iter.Next(); raw != nil; raw = iter.Next() { + key := raw.(*structs.RootKeyMeta) + switch key.State { + case structs.RootKeyStateActive: + activeKey = key + case structs.RootKeyStatePrepublished: + // multiple keys can be prepublished, so we only want to handle the + // very next one + if prepublishedKey == nil { + prepublishedKey = key + } else if prepublishedKey.PublishTime > key.PublishTime { + prepublishedKey = key + } + } + } + + if prepublishedKey != nil { + c.logger.Trace("checking prepublished key eligibility for promotion", + "publish_time", prepublishedKey.PublishTime, "now", now.UnixNano()) + + if prepublishedKey.PublishTime > now.UnixNano() { + // at this point we have a key in a prepublished state but it's not + // ready to be made active, so we bail out. otherwise we'd kick off + // a new rotation every time we process this eval and we're past + // internval/2 + return false, nil + } + + rootKey, err := c.srv.encrypter.GetKey(prepublishedKey.KeyID) + if err != nil { + c.logger.Error("prepublished key does not exist in keyring", "error", err) + return false, nil + } + rootKey = rootKey.Copy() + rootKey.Meta.SetActive() + + req := &structs.KeyringUpdateRootKeyRequest{ + RootKey: rootKey, + WriteRequest: structs.WriteRequest{ + Region: c.srv.config.Region, + AuthToken: eval.LeaderACL, + }, + } + + if err := c.srv.RPC("Keyring.Update", + req, &structs.KeyringUpdateRootKeyResponse{}); err != nil { + c.logger.Error("setting prepublished key active failed", "error", err) + return false, err + } + return true, nil + } + if activeKey == nil { - return false, nil // no active key + c.logger.Warn("keyring has no active key: rotate keyring to repair") + return false, nil } - if activeKey.CreateIndex >= rotationThreshold { + + // we rotate at half the rotation threshold because we want to prepublish a key + rotationThreshold := now.Add(-1 * c.srv.config.RootKeyRotationThreshold / 2) + + c.logger.Trace("checking active key eligibility for rotation", + "create_time", activeKey.CreateTime, "threshold", rotationThreshold.UnixNano()) + + if activeKey.CreateTime > rotationThreshold.UnixNano() { return false, nil // key is too new } + // this eval may be processed up to RootKeyGCInterval after the halfway + // mark, so use the CreateTime of the previous key rather than the wall + // clock to set the publish time + publishTime := activeKey.CreateTime + c.srv.config.RootKeyRotationThreshold.Nanoseconds() + req := &structs.KeyringRotateRootKeyRequest{ + PublishTime: publishTime, WriteRequest: structs.WriteRequest{ Region: c.srv.config.Region, AuthToken: eval.LeaderACL, @@ -1026,6 +1105,24 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error { return err } + rootKey, err := c.srv.encrypter.GetKey(keyMeta.KeyID) + if err != nil { + return fmt.Errorf("rotated key does not exist in keyring: %w", err) + } + rootKey = rootKey.Copy() + rootKey.Meta.SetInactive() + + req := &structs.KeyringUpdateRootKeyRequest{ + RootKey: rootKey, + WriteRequest: structs.WriteRequest{ + Region: c.srv.config.Region, + AuthToken: eval.LeaderACL}, + } + if err := c.srv.RPC("Keyring.Update", + req, &structs.KeyringUpdateRootKeyResponse{}); err != nil { + c.logger.Error("rekey complete but failed to mark key as inactive", "error", err) + return err + } } return nil diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 2f1347c896d6..867426d2d486 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -19,6 +19,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -2609,32 +2610,139 @@ func TestCoreScheduler_CSIBadState_ClaimGC(t *testing.T) { } } +// TestCoreScheduler_RootKeyRotate exercises periodic rotation of the root key +func TestCoreScheduler_RootKeyRotate(t *testing.T) { + ci.Parallel(t) + + srv, cleanup := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 + c.RootKeyRotationThreshold = time.Hour + }) + defer cleanup() + testutil.WaitForKeyring(t, srv.RPC, "global") + + // active key, will never be GC'd + store := srv.fsm.State() + key0, err := store.GetActiveRootKeyMeta(nil) + must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped")) + must.NoError(t, err) + + // run the core job + snap, err := store.Snapshot() + must.NoError(t, err) + core := NewCoreScheduler(srv, snap) + index := key0.ModifyIndex + 1 + eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, index) + c := core.(*CoreScheduler) + + // Eval immediately + now := time.Unix(0, key0.CreateTime) + rotated, err := c.rootKeyRotate(eval, now) + must.NoError(t, err) + must.False(t, rotated, must.Sprint("key should not rotate")) + + // Eval after half threshold has passed + c.snap, _ = store.Snapshot() + now = time.Unix(0, key0.CreateTime+(time.Minute*40).Nanoseconds()) + rotated, err = c.rootKeyRotate(eval, now) + must.NoError(t, err) + must.True(t, rotated, must.Sprint("key should rotate")) + + var key1 *structs.RootKeyMeta + iter, err := store.RootKeyMetas(nil) + must.NoError(t, err) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + k := raw.(*structs.RootKeyMeta) + if k.KeyID == key0.KeyID { + must.True(t, k.Active(), must.Sprint("expected original key to be active")) + } else { + key1 = k + } + } + must.NotNil(t, key1) + must.True(t, key1.Prepublished()) + must.Eq(t, key0.CreateTime+time.Hour.Nanoseconds(), key1.PublishTime) + + // Externally rotate with prepublish to add a second prepublished key + resp := &structs.KeyringRotateRootKeyResponse{} + must.NoError(t, srv.RPC("Keyring.Rotate", &structs.KeyringRotateRootKeyRequest{ + PublishTime: key1.PublishTime + (time.Hour * 24).Nanoseconds(), + WriteRequest: structs.WriteRequest{Region: srv.Region()}, + }, resp)) + key2 := resp.Key + + // Eval again with time unchanged + c.snap, _ = store.Snapshot() + rotated, err = c.rootKeyRotate(eval, now) + + iter, err = store.RootKeyMetas(nil) + must.NoError(t, err) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + k := raw.(*structs.RootKeyMeta) + switch k.KeyID { + case key0.KeyID: + must.True(t, k.Active(), must.Sprint("original key should still be active")) + case key1.KeyID, key2.KeyID: + must.True(t, k.Prepublished(), must.Sprint("new key should be prepublished")) + default: + t.Fatalf("should not have created any new keys: %#v", k) + } + } + + // Eval again with time after publish time + c.snap, _ = store.Snapshot() + now = time.Unix(0, key1.PublishTime+(time.Minute*10).Nanoseconds()) + rotated, err = c.rootKeyRotate(eval, now) + + iter, err = store.RootKeyMetas(nil) + must.NoError(t, err) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + k := raw.(*structs.RootKeyMeta) + switch k.KeyID { + case key0.KeyID: + must.True(t, k.Inactive(), must.Sprint("original key should be inactive")) + case key1.KeyID: + must.True(t, k.Active(), must.Sprint("prepublished key should now be active")) + case key2.KeyID: + must.True(t, k.Prepublished(), must.Sprint("later prepublished key should still be prepublished")) + default: + t.Fatalf("should not have created any new keys: %#v", k) + } + } +} + // TestCoreScheduler_RootKeyGC exercises root key GC func TestCoreScheduler_RootKeyGC(t *testing.T) { ci.Parallel(t) - srv, cleanup := TestServer(t, nil) + srv, cleanup := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 + c.RootKeyRotationThreshold = time.Hour + c.RootKeyGCThreshold = time.Minute * 10 + }) defer cleanup() testutil.WaitForKeyring(t, srv.RPC, "global") - // reset the time table - srv.fsm.timetable.table = make([]TimeTableEntry, 1, 10) - // active key, will never be GC'd store := srv.fsm.State() key0, err := store.GetActiveRootKeyMeta(nil) - require.NotNil(t, key0, "expected keyring to be bootstapped") - require.NoError(t, err) + must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped")) + must.NoError(t, err) + + now := key0.CreateTime + yesterday := now - (24 * time.Hour).Nanoseconds() // insert an "old" inactive key key1 := structs.NewRootKeyMeta() key1.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(600, key1, false)) + key1.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(600, key1, false)) // insert an "old" and inactive key with a variable that's using it key2 := structs.NewRootKeyMeta() key2.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(700, key2, false)) + key2.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(700, key2, false)) variable := mock.VariableEncrypted() variable.KeyID = key2.KeyID @@ -2643,89 +2751,99 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) { Op: structs.VarOpSet, Var: variable, }) - require.NoError(t, setResp.Error) + must.NoError(t, setResp.Error) // insert an "old" key that's inactive but being used by an alloc key3 := structs.NewRootKeyMeta() key3.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(800, key3, false)) + key3.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(800, key3, false)) // insert the allocation using key3 alloc := mock.Alloc() alloc.ClientStatus = structs.AllocClientStatusRunning alloc.SigningKeyID = key3.KeyID - require.NoError(t, store.UpsertAllocs( + must.NoError(t, store.UpsertAllocs( structs.MsgTypeTestSetup, 850, []*structs.Allocation{alloc})) // insert an "old" key that's inactive but being used by an alloc key4 := structs.NewRootKeyMeta() key4.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(900, key4, false)) + key4.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(900, key4, false)) // insert the dead allocation using key4 alloc2 := mock.Alloc() alloc2.ClientStatus = structs.AllocClientStatusFailed alloc2.DesiredStatus = structs.AllocDesiredStatusStop alloc2.SigningKeyID = key4.KeyID - require.NoError(t, store.UpsertAllocs( + must.NoError(t, store.UpsertAllocs( structs.MsgTypeTestSetup, 950, []*structs.Allocation{alloc2})) - // insert a time table index before the last key - tt := srv.fsm.TimeTable() - tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold)) - - // insert a "new" but inactive key + // insert an inactive key older than RootKeyGCThreshold but not RootKeyRotationThreshold key5 := structs.NewRootKeyMeta() key5.SetInactive() - require.NoError(t, store.UpsertRootKeyMeta(1500, key5, false)) + key5.CreateTime = now - (15 * time.Minute).Nanoseconds() + must.NoError(t, store.UpsertRootKeyMeta(1500, key5, false)) + + // prepublishing key should never be GC'd no matter how old + key6 := structs.NewRootKeyMeta() + key6.SetPrepublished(yesterday) + key6.CreateTime = yesterday + must.NoError(t, store.UpsertRootKeyMeta(1600, key6, false)) // run the core job snap, err := store.Snapshot() - require.NoError(t, err) + must.NoError(t, err) core := NewCoreScheduler(srv, snap) eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000) c := core.(*CoreScheduler) - require.NoError(t, c.rootKeyRotateOrGC(eval)) + must.NoError(t, c.rootKeyGC(eval, time.Now())) ws := memdb.NewWatchSet() key, err := store.RootKeyMetaByID(ws, key0.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "active key should not have been GCd") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("active key should not have been GCd")) key, err = store.RootKeyMetaByID(ws, key1.KeyID) - require.NoError(t, err) - require.Nil(t, key, "old and unused inactive key should have been GCd") + must.NoError(t, err) + must.Nil(t, key, must.Sprint("old and unused inactive key should have been GCd")) key, err = store.RootKeyMetaByID(ws, key2.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "old key should not have been GCd if still in use") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("old key should not have been GCd if still in use")) key, err = store.RootKeyMetaByID(ws, key3.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "old key used to sign a live alloc should not have been GCd") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("old key used to sign a live alloc should not have been GCd")) key, err = store.RootKeyMetaByID(ws, key4.KeyID) - require.NoError(t, err) - require.Nil(t, key, "old key used to sign a terminal alloc should have been GCd") + must.NoError(t, err) + must.Nil(t, key, must.Sprint("old key used to sign a terminal alloc should have been GCd")) key, err = store.RootKeyMetaByID(ws, key5.KeyID) - require.NoError(t, err) - require.NotNil(t, key, "new key should not have been GCd") + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("key newer than GC+rotation threshold should not have been GCd")) + key, err = store.RootKeyMetaByID(ws, key6.KeyID) + must.NoError(t, err) + must.NotNil(t, key, must.Sprint("prepublishing key should not have been GCd")) } // TestCoreScheduler_VariablesRekey exercises variables rekeying func TestCoreScheduler_VariablesRekey(t *testing.T) { ci.Parallel(t) - srv, cleanup := TestServer(t, nil) + srv, cleanup := TestServer(t, func(c *Config) { + c.NumSchedulers = 1 + }) defer cleanup() testutil.WaitForKeyring(t, srv.RPC, "global") store := srv.fsm.State() key0, err := store.GetActiveRootKeyMeta(nil) - require.NotNil(t, key0, "expected keyring to be bootstapped") - require.NoError(t, err) + must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped")) + must.NoError(t, err) for i := 0; i < 3; i++ { req := &structs.VariablesApplyRequest{ @@ -2734,7 +2852,7 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) { WriteRequest: structs.WriteRequest{Region: srv.config.Region}, } resp := &structs.VariablesApplyResponse{} - require.NoError(t, srv.RPC("Variables.Apply", req, resp)) + must.NoError(t, srv.RPC("Variables.Apply", req, resp)) } rotateReq := &structs.KeyringRotateRootKeyRequest{ @@ -2743,7 +2861,7 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) { }, } var rotateResp structs.KeyringRotateRootKeyResponse - require.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) + must.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) for i := 0; i < 3; i++ { req := &structs.VariablesApplyRequest{ @@ -2752,31 +2870,29 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) { WriteRequest: structs.WriteRequest{Region: srv.config.Region}, } resp := &structs.VariablesApplyResponse{} - require.NoError(t, srv.RPC("Variables.Apply", req, resp)) + must.NoError(t, srv.RPC("Variables.Apply", req, resp)) } rotateReq.Full = true - require.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) + must.NoError(t, srv.RPC("Keyring.Rotate", rotateReq, &rotateResp)) newKeyID := rotateResp.Key.KeyID - require.Eventually(t, func() bool { - ws := memdb.NewWatchSet() - iter, err := store.Variables(ws) - require.NoError(t, err) - for { - raw := iter.Next() - if raw == nil { - break + must.Wait(t, wait.InitialSuccess( + wait.Timeout(5*time.Second), + wait.Gap(100*time.Millisecond), + wait.BoolFunc(func() bool { + iter, _ := store.Variables(nil) + for raw := iter.Next(); raw != nil; raw = iter.Next() { + variable := raw.(*structs.VariableEncrypted) + if variable.KeyID != newKeyID { + return false + } } - variable := raw.(*structs.VariableEncrypted) - if variable.KeyID != newKeyID { - return false - } - } - return true - }, time.Second*5, 100*time.Millisecond, - "variable rekey should be complete") + originalKey, _ := store.RootKeyMetaByID(nil, key0.KeyID) + return originalKey.Inactive() + }), + ), must.Sprint("variable rekey should be complete")) } func TestCoreScheduler_FailLoop(t *testing.T) { diff --git a/nomad/encrypter.go b/nomad/encrypter.go index 9264612f74d5..15167ec6ffec 100644 --- a/nomad/encrypter.go +++ b/nomad/encrypter.go @@ -21,15 +21,14 @@ import ( "github.com/go-jose/go-jose/v3" "github.com/go-jose/go-jose/v3/jwt" - log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-hclog" kms "github.com/hashicorp/go-kms-wrapping/v2" "github.com/hashicorp/go-kms-wrapping/v2/aead" - "golang.org/x/time/rate" - "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/crypto" "github.com/hashicorp/nomad/helper/joseutil" "github.com/hashicorp/nomad/nomad/structs" + "golang.org/x/time/rate" ) const nomadKeystoreExtension = ".nks.json" @@ -337,16 +336,16 @@ func (e *Encrypter) addCipher(rootKey *structs.RootKey) error { return nil } -// GetKey retrieves the key material by ID from the keyring -func (e *Encrypter) GetKey(keyID string) ([]byte, []byte, error) { +// GetKey retrieves the key material by ID from the keyring. +func (e *Encrypter) GetKey(keyID string) (*structs.RootKey, error) { e.lock.RLock() defer e.lock.RUnlock() keyset, err := e.keysetByIDLocked(keyID) if err != nil { - return nil, nil, err + return nil, err } - return keyset.rootKey.Key, keyset.rootKey.RSAKey, nil + return keyset.rootKey, nil } // activeKeySetLocked returns the keyset that belongs to the key marked as @@ -528,7 +527,7 @@ func (e *Encrypter) newKMSWrapper(keyID string, kek []byte) (kms.Wrapper, error) type KeyringReplicator struct { srv *Server encrypter *Encrypter - logger log.Logger + logger hclog.Logger stopFn context.CancelFunc } @@ -582,7 +581,7 @@ func (krr *KeyringReplicator) run(ctx context.Context) { } keyMeta := raw.(*structs.RootKeyMeta) - if key, _, err := krr.encrypter.GetKey(keyMeta.KeyID); err == nil && len(key) > 0 { + if key, err := krr.encrypter.GetKey(keyMeta.KeyID); err == nil && len(key.Key) > 0 { // the key material is immutable so if we've already got it // we can move on to the next key continue diff --git a/nomad/keyring_endpoint.go b/nomad/keyring_endpoint.go index 65a8d02b38c2..c3a82758c387 100644 --- a/nomad/keyring_endpoint.go +++ b/nomad/keyring_endpoint.go @@ -50,13 +50,20 @@ func (k *Keyring) Rotate(args *structs.KeyringRotateRootKeyRequest, reply *struc if args.Algorithm == "" { args.Algorithm = structs.EncryptionAlgorithmAES256GCM } + if args.Full && args.PublishTime > 0 { + return fmt.Errorf("keyring cannot be prepublished and full rotated at the same time") + } rootKey, err := structs.NewRootKey(args.Algorithm) if err != nil { return err } - rootKey.Meta.SetActive() + if args.PublishTime != 0 { + rootKey.Meta.SetPrepublished(args.PublishTime) + } else { + rootKey.Meta.SetActive() + } // make sure it's been added to the local keystore before we write // it to raft, so that followers don't try to Get a key that @@ -268,15 +275,10 @@ func (k *Keyring) Get(args *structs.KeyringGetRootKeyRequest, reply *structs.Key } // retrieve the key material from the keyring - key, rsaKey, err := k.encrypter.GetKey(keyMeta.KeyID) + rootKey, err := k.encrypter.GetKey(keyMeta.KeyID) if err != nil { return err } - rootKey := &structs.RootKey{ - Meta: keyMeta, - Key: key, - RSAKey: rsaKey, - } reply.Key = rootKey // Use the last index that affected the policy table diff --git a/nomad/keyring_endpoint_test.go b/nomad/keyring_endpoint_test.go index e0573fad840b..e97b4150f0b3 100644 --- a/nomad/keyring_endpoint_test.go +++ b/nomad/keyring_endpoint_test.go @@ -228,10 +228,14 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { testutil.WaitForKeyring(t, srv.RPC, "global") codec := rpcClient(t, srv) + store := srv.fsm.State() + key0, err := store.GetActiveRootKeyMeta(nil) + must.NoError(t, err) + // Setup an existing key key, err := structs.NewRootKey(structs.EncryptionAlgorithmAES256GCM) - require.NoError(t, err) - key.Meta.SetActive() + must.NoError(t, err) + key1 := key.Meta updateReq := &structs.KeyringUpdateRootKeyRequest{ RootKey: key, @@ -242,7 +246,7 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { } var updateResp structs.KeyringUpdateRootKeyResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.Update", updateReq, &updateResp) - require.NoError(t, err) + must.NoError(t, err) // Rotate the key @@ -253,14 +257,13 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { } var rotateResp structs.KeyringRotateRootKeyResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) - require.EqualError(t, err, structs.ErrPermissionDenied.Error()) + must.EqError(t, err, structs.ErrPermissionDenied.Error()) rotateReq.AuthToken = rootToken.SecretID err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) - require.NoError(t, err) - require.NotEqual(t, updateResp.Index, rotateResp.Index) - - newID := rotateResp.Key.KeyID + must.NoError(t, err) + must.Greater(t, updateResp.Index, rotateResp.Index) + key2 := rotateResp.Key // Verify we have a new key and the old one is inactive @@ -272,31 +275,62 @@ func TestKeyringEndpoint_Rotate(t *testing.T) { } var listResp structs.KeyringListRootKeyMetaResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.List", listReq, &listResp) - require.NoError(t, err) - - require.Greater(t, listResp.Index, updateResp.Index) - require.Len(t, listResp.Keys, 3) // bootstrap + old + new + must.NoError(t, err) + must.Greater(t, updateResp.Index, listResp.Index) + must.Len(t, 3, listResp.Keys) // bootstrap + old + new for _, keyMeta := range listResp.Keys { - if keyMeta.KeyID != newID { - require.False(t, keyMeta.Active(), "expected old keys to be inactive") - } else { - require.True(t, keyMeta.Active(), "expected new key to be inactive") + switch keyMeta.KeyID { + case key0.KeyID, key1.KeyID: + must.True(t, keyMeta.Inactive(), must.Sprint("older keys must be inactive")) + case key2.KeyID: + must.True(t, keyMeta.Active(), must.Sprint("expected new key to be active")) } } getReq := &structs.KeyringGetRootKeyRequest{ - KeyID: newID, + KeyID: key2.KeyID, QueryOptions: structs.QueryOptions{ Region: "global", }, } var getResp structs.KeyringGetRootKeyResponse err = msgpackrpc.CallWithCodec(codec, "Keyring.Get", getReq, &getResp) - require.NoError(t, err) + must.NoError(t, err) + must.Len(t, 32, getResp.Key.Key) + + // Rotate the key with prepublishing - gotKey := getResp.Key - require.Len(t, gotKey.Key, 32) + publishTime := time.Now().Add(24 * time.Hour).UnixNano() + rotateResp = structs.KeyringRotateRootKeyResponse{} + rotateReq = &structs.KeyringRotateRootKeyRequest{ + PublishTime: publishTime, + WriteRequest: structs.WriteRequest{ + Region: "global", + AuthToken: rootToken.SecretID, + }, + } + err = msgpackrpc.CallWithCodec(codec, "Keyring.Rotate", rotateReq, &rotateResp) + must.NoError(t, err) + must.Greater(t, updateResp.Index, rotateResp.Index) + key3 := rotateResp.Key + + listResp = structs.KeyringListRootKeyMetaResponse{} + err = msgpackrpc.CallWithCodec(codec, "Keyring.List", listReq, &listResp) + must.NoError(t, err) + must.Greater(t, updateResp.Index, listResp.Index) + must.Len(t, 4, listResp.Keys) // bootstrap + old + new + prepublished + + for _, keyMeta := range listResp.Keys { + switch keyMeta.KeyID { + case key0.KeyID, key1.KeyID: + must.True(t, keyMeta.Inactive(), must.Sprint("older keys must be inactive")) + case key2.KeyID: + must.True(t, keyMeta.Active(), must.Sprint("expected active key to remain active")) + case key3.KeyID: + must.True(t, keyMeta.Prepublished(), must.Sprint("expected new key to be prepublished")) + } + } } // TestKeyringEndpoint_ListPublic asserts the Keyring.ListPublic RPC returns diff --git a/nomad/structs/keyring.go b/nomad/structs/keyring.go index 103068716d86..f82ce6320481 100644 --- a/nomad/structs/keyring.go +++ b/nomad/structs/keyring.go @@ -75,6 +75,17 @@ func NewRootKey(algorithm EncryptionAlgorithm) (*RootKey, error) { return rootKey, nil } +func (k *RootKey) Copy() *RootKey { + out := &RootKey{ + Meta: k.Meta.Copy(), + Key: make([]byte, len(k.Key)), + RSAKey: make([]byte, len(k.RSAKey)), + } + copy(out.Key, k.Key) + copy(out.RSAKey, k.RSAKey) + return out +} + // RootKeyMeta is the metadata used to refer to a RootKey. It is // stored in raft. type RootKeyMeta struct { @@ -84,15 +95,17 @@ type RootKeyMeta struct { CreateIndex uint64 ModifyIndex uint64 State RootKeyState + PublishTime int64 } // RootKeyState enum describes the lifecycle of a root key. type RootKeyState string const ( - RootKeyStateInactive RootKeyState = "inactive" - RootKeyStateActive = "active" - RootKeyStateRekeying = "rekeying" + RootKeyStateInactive RootKeyState = "inactive" + RootKeyStateActive = "active" + RootKeyStateRekeying = "rekeying" + RootKeyStatePrepublished = "prepublished" // RootKeyStateDeprecated is, itself, deprecated and is no longer in // use. For backwards compatibility, any existing keys with this state will @@ -130,6 +143,7 @@ func (rkm *RootKeyMeta) Active() bool { func (rkm *RootKeyMeta) SetActive() { rkm.State = RootKeyStateActive + rkm.PublishTime = 0 } // Rekeying indicates that variables encrypted with this key should be @@ -142,6 +156,15 @@ func (rkm *RootKeyMeta) SetRekeying() { rkm.State = RootKeyStateRekeying } +func (rkm *RootKeyMeta) SetPrepublished(t int64) { + rkm.PublishTime = t + rkm.State = RootKeyStatePrepublished +} + +func (rkm *RootKeyMeta) Prepublished() bool { + return rkm.State == RootKeyStatePrepublished +} + func (rkm *RootKeyMeta) SetInactive() { rkm.State = RootKeyStateInactive } @@ -184,7 +207,7 @@ func (rkm *RootKeyMeta) Validate() error { } switch rkm.State { case RootKeyStateInactive, RootKeyStateActive, - RootKeyStateRekeying, RootKeyStateDeprecated: + RootKeyStateRekeying, RootKeyStateDeprecated, RootKeyStatePrepublished: default: return fmt.Errorf("root key state %q is invalid", rkm.State) } @@ -211,8 +234,9 @@ const ( // KeyringRotateRootKeyRequest is the argument to the Keyring.Rotate RPC type KeyringRotateRootKeyRequest struct { - Algorithm EncryptionAlgorithm - Full bool + Algorithm EncryptionAlgorithm + Full bool + PublishTime int64 WriteRequest } @@ -261,7 +285,7 @@ type KeyringGetRootKeyResponse struct { // KeyringUpdateRootKeyMetaRequest is used internally for key // replication so that we have a request wrapper for writing the -// metadata to the FSM without including the key material +// metadata to the FSM without including the key material. type KeyringUpdateRootKeyMetaRequest struct { RootKeyMeta *RootKeyMeta Rekey bool