Skip to content

Commit

Permalink
cherrypick(v0.34.x): feat: Add Versioned for NodePool Hash to Prevent…
Browse files Browse the repository at this point in the history
… Drift on NodePool CRD Upgrade (#1128)
  • Loading branch information
engedaam committed Mar 22, 2024
1 parent dd87d6e commit bee2bf9
Show file tree
Hide file tree
Showing 7 changed files with 202 additions and 11 deletions.
1 change: 1 addition & 0 deletions pkg/apis/v1beta1/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const (
ProviderCompatabilityAnnotationKey = CompatabilityGroup + "/provider"
ManagedByAnnotationKey = Group + "/managed-by"
NodePoolHashAnnotationKey = Group + "/nodepool-hash"
NodePoolHashVersionAnnotationKey = Group + "/nodepool-hash-version"
)

// Karpenter specific finalizers
Expand Down
6 changes: 6 additions & 0 deletions pkg/apis/v1beta1/nodepool.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,12 @@ type NodePool struct {
Status NodePoolStatus `json:"status,omitempty"`
}

// We need to bump the NodePoolHashVersion when we make an update to the NodePool CRD under these conditions:
// 1. A field changes its default value for an existing field that is already hashed
// 2. A field is added to the hash calculation with an already-set value
// 3. A field is removed from the hash calculations
const NodePoolHashVersion = "v1"

func (in *NodePool) Hash() string {
return fmt.Sprint(lo.Must(hashstructure.Hash(in.Spec.Template, hashstructure.FormatV2, &hashstructure.HashOptions{
SlicesAsSets: true,
Expand Down
11 changes: 9 additions & 2 deletions pkg/controllers/nodeclaim/disruption/drift.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,19 @@ func (d *Drift) isDrifted(ctx context.Context, nodePool *v1beta1.NodePool, nodeC
return driftedReason, nil
}

// Eligible fields for static drift are described in the docs
// Eligible fields for drift are described in the docs
// https://karpenter.sh/docs/concepts/deprovisioning/#drift
func areStaticFieldsDrifted(nodePool *v1beta1.NodePool, nodeClaim *v1beta1.NodeClaim) cloudprovider.DriftReason {
nodePoolHash, foundHashNodePool := nodePool.Annotations[v1beta1.NodePoolHashAnnotationKey]
nodePoolVersionHash, foundVersionHashNodePool := nodePool.Annotations[v1beta1.NodePoolHashVersionAnnotationKey]
nodeClaimHash, foundHashNodeClaim := nodeClaim.Annotations[v1beta1.NodePoolHashAnnotationKey]
if !foundHashNodePool || !foundHashNodeClaim {
nodeClaimVersionHash, foundVersionHashNodeClaim := nodeClaim.Annotations[v1beta1.NodePoolHashVersionAnnotationKey]

if !foundHashNodePool || !foundHashNodeClaim || !foundVersionHashNodePool || !foundVersionHashNodeClaim {
return ""
}
// validate that the version of the crd is the same
if nodePoolVersionHash != nodeClaimVersionHash {
return ""
}
return lo.Ternary(nodePoolHash != nodeClaimHash, NodePoolDrifted, "")
Expand Down
26 changes: 23 additions & 3 deletions pkg/controllers/nodeclaim/disruption/drift_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,12 @@ var _ = Describe("Drift", func() {
It("should detect static drift before cloud provider drift", func() {
cp.Drifted = "drifted"
nodePool.Annotations = lo.Assign(nodePool.Annotations, map[string]string{
v1beta1.NodePoolHashAnnotationKey: "123456789",
v1beta1.NodePoolHashAnnotationKey: "test-123456789",
v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
})
nodeClaim.Annotations = lo.Assign(nodeClaim.Annotations, map[string]string{
v1beta1.NodePoolHashAnnotationKey: "test-123",
v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
})
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
ExpectReconcileSucceeded(ctx, nodeClaimDisruptionController, client.ObjectKeyFromObject(nodeClaim))
Expand Down Expand Up @@ -427,8 +432,9 @@ var _ = Describe("Drift", func() {
Template: v1beta1.NodeClaimTemplate{
ObjectMeta: v1beta1.ObjectMeta{
Annotations: map[string]string{
"keyAnnotation": "valueAnnotation",
"keyAnnotation2": "valueAnnotation2",
"keyAnnotation": "valueAnnotation",
"keyAnnotation2": "valueAnnotation2",
v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
},
Labels: map[string]string{
"keyLabel": "valueLabel",
Expand Down Expand Up @@ -495,5 +501,19 @@ var _ = Describe("Drift", func() {
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.StatusConditions().GetCondition(v1beta1.Drifted)).To(BeNil())
})
It("should not return drifted if the NodeClaim's karpenter.sh/nodepool-hash-version annotation does not match the NodePool's", func() {
nodePool.ObjectMeta.Annotations = map[string]string{
v1beta1.NodePoolHashAnnotationKey: "test-hash-1",
v1beta1.NodePoolHashVersionAnnotationKey: "test-version-1",
}
nodeClaim.ObjectMeta.Annotations = map[string]string{
v1beta1.NodePoolHashAnnotationKey: "test-hash-2",
v1beta1.NodePoolHashVersionAnnotationKey: "test-version-2",
}
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
ExpectReconcileSucceeded(ctx, nodeClaimDisruptionController, client.ObjectKeyFromObject(nodeClaim))
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.StatusConditions().GetCondition(v1beta1.Drifted)).To(BeNil())
})
})
})
51 changes: 50 additions & 1 deletion pkg/controllers/nodepool/hash/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"

"github.com/samber/lo"
"go.uber.org/multierr"
"k8s.io/apimachinery/pkg/api/equality"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -48,7 +49,16 @@ func NewController(kubeClient client.Client) operatorcontroller.Controller {
// Reconcile the resource
func (c *Controller) Reconcile(ctx context.Context, np *v1beta1.NodePool) (reconcile.Result, error) {
stored := np.DeepCopy()
np.Annotations = lo.Assign(np.Annotations, map[string]string{v1beta1.NodePoolHashAnnotationKey: np.Hash()})

if np.Annotations[v1beta1.NodePoolHashVersionAnnotationKey] != v1beta1.NodePoolHashVersion {
if err := c.updateNodeClaimHash(ctx, np); err != nil {
return reconcile.Result{}, err
}
}
np.Annotations = lo.Assign(np.Annotations, map[string]string{
v1beta1.NodePoolHashAnnotationKey: np.Hash(),
v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
})

if !equality.Semantic.DeepEqual(stored, np) {
if err := c.kubeClient.Patch(ctx, np, client.MergeFrom(stored)); err != nil {
Expand All @@ -69,3 +79,42 @@ func (c *Controller) Builder(_ context.Context, m manager.Manager) operatorcontr
WithOptions(controller.Options{MaxConcurrentReconciles: 10}),
)
}

// Updating `nodepool-hash-version` annotation inside the controller means a breaking change has made to the hash function calculating
// `nodepool-hash` on both the NodePool and NodeClaim, automatically making the `nodepool-hash` on the NodeClaim different from
// NodePool. Since we can not rely on the hash on the NodeClaims, we will need to re-calculate the hash and update the annotation.
// Look at designs/drift-hash-version.md for more information.
func (c *Controller) updateNodeClaimHash(ctx context.Context, np *v1beta1.NodePool) error {
ncList := &v1beta1.NodeClaimList{}
if err := c.kubeClient.List(ctx, ncList, client.MatchingLabels(map[string]string{v1beta1.NodePoolLabelKey: np.Name})); err != nil {
return err
}

errs := make([]error, len(ncList.Items))
for i := range ncList.Items {
nc := ncList.Items[i]
stored := nc.DeepCopy()

if nc.Annotations[v1beta1.NodePoolHashVersionAnnotationKey] != v1beta1.NodePoolHashVersion {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
})

// Any NodeClaim that is already drifted will remain drifted if the karpenter.sh/nodepool-hash-version doesn't match
// Since the hashing mechanism has changed we will not be able to determine if the drifted status of the node has changed
if nc.StatusConditions().GetCondition(v1beta1.Drifted) == nil {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
v1beta1.NodePoolHashAnnotationKey: np.Hash(),
})
}

if !equality.Semantic.DeepEqual(stored, nc) {
if err := c.kubeClient.Patch(ctx, &nc, client.MergeFrom(stored)); err != nil {
errs[i] = client.IgnoreNotFound(err)
}
}
}
}

return multierr.Combine(errs...)
}
111 changes: 108 additions & 3 deletions pkg/controllers/nodepool/hash/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ import (
"knative.dev/pkg/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"

. "sigs.k8s.io/karpenter/pkg/test/expectations"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"sigs.k8s.io/karpenter/pkg/apis"
. "sigs.k8s.io/karpenter/pkg/test/expectations"

"sigs.k8s.io/karpenter/pkg/apis/v1beta1"
"sigs.k8s.io/karpenter/pkg/controllers/nodepool/hash"
"sigs.k8s.io/karpenter/pkg/operator/controller"
Expand Down Expand Up @@ -95,7 +97,7 @@ var _ = Describe("Static Drift Hash", func() {
},
})
})
It("should update the static drift hash when NodePool static field is updated", func() {
It("should update the drift hash when NodePool static field is updated", func() {
ExpectApplied(ctx, env.Client, nodePool)
ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
nodePool = ExpectExists(ctx, env.Client, nodePool)
Expand All @@ -111,7 +113,7 @@ var _ = Describe("Static Drift Hash", func() {
expectedHashTwo := nodePool.Hash()
Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHashTwo))
})
It("should not update the static drift hash when NodePool behavior field is updated", func() {
It("should not update the drift hash when NodePool behavior field is updated", func() {
ExpectApplied(ctx, env.Client, nodePool)
ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
nodePool = ExpectExists(ctx, env.Client, nodePool)
Expand All @@ -134,4 +136,107 @@ var _ = Describe("Static Drift Hash", func() {

Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
})
It("should update nodepool hash version when the nodepool hash version is out of sync with the controller hash version", func() {
nodePool.Annotations = map[string]string{
v1beta1.NodePoolHashAnnotationKey: "abceduefed",
v1beta1.NodePoolHashVersionAnnotationKey: "test",
}
ExpectApplied(ctx, env.Client, nodePool)

ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
nodePool = ExpectExists(ctx, env.Client, nodePool)

expectedHash := nodePool.Hash()
Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
})
It("should update nodepool hash versions on all nodeclaims when the hash versions don't match the controller hash version", func() {
nodePool.Annotations = map[string]string{
v1beta1.NodePoolHashAnnotationKey: "abceduefed",
v1beta1.NodePoolHashVersionAnnotationKey: "test",
}
nodeClaimOne := test.NodeClaim(v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name},
Annotations: map[string]string{
v1beta1.NodePoolHashAnnotationKey: "123456",
v1beta1.NodePoolHashVersionAnnotationKey: "test",
},
},
})
nodeClaimTwo := test.NodeClaim(v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name},
Annotations: map[string]string{
v1beta1.NodePoolHashAnnotationKey: "123456",
v1beta1.NodePoolHashVersionAnnotationKey: "test",
},
},
})

ExpectApplied(ctx, env.Client, nodePool, nodeClaimOne, nodeClaimTwo)

ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
nodePool = ExpectExists(ctx, env.Client, nodePool)
nodeClaimOne = ExpectExists(ctx, env.Client, nodeClaimOne)
nodeClaimTwo = ExpectExists(ctx, env.Client, nodeClaimTwo)

expectedHash := nodePool.Hash()
Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
Expect(nodeClaimOne.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
Expect(nodeClaimTwo.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
})
It("should not update nodepool hash on all nodeclaims when the hash versions match the controller hash version", func() {
nodePool.Annotations = map[string]string{
v1beta1.NodePoolHashAnnotationKey: "abceduefed",
v1beta1.NodePoolHashVersionAnnotationKey: "test-version",
}
nodeClaim := test.NodeClaim(v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name},
Annotations: map[string]string{
v1beta1.NodePoolHashAnnotationKey: "1234564654",
v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
},
},
})
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)

ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
nodePool = ExpectExists(ctx, env.Client, nodePool)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)

expectedHash := nodePool.Hash()

// Expect NodeClaims to have been updated to the original hash
Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, expectedHash))
Expect(nodePool.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, "1234564654"))
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
})
It("should not update nodepool hash on the nodeclaim if it's drifted", func() {
nodePool.Annotations = map[string]string{
v1beta1.NodePoolHashAnnotationKey: "abceduefed",
v1beta1.NodePoolHashVersionAnnotationKey: "test",
}
nodeClaim := test.NodeClaim(v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{v1beta1.NodePoolLabelKey: nodePool.Name},
Annotations: map[string]string{
v1beta1.NodePoolHashAnnotationKey: "123456",
v1beta1.NodePoolHashVersionAnnotationKey: "test",
},
},
})
nodeClaim.StatusConditions().MarkTrue(v1beta1.Drifted)
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)

ExpectReconcileSucceeded(ctx, nodePoolController, client.ObjectKeyFromObject(nodePool))
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)

// Expect NodeClaims hash to not have been updated
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashAnnotationKey, "123456"))
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.NodePoolHashVersionAnnotationKey, v1beta1.NodePoolHashVersion))
})
})
7 changes: 5 additions & 2 deletions pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,11 @@ func (i *NodeClaimTemplate) ToNodeClaim(nodePool *v1beta1.NodePool) *v1beta1.Nod
nc := &v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
GenerateName: fmt.Sprintf("%s-", i.NodePoolName),
Annotations: lo.Assign(i.Annotations, map[string]string{v1beta1.NodePoolHashAnnotationKey: nodePool.Hash()}),
Labels: i.Labels,
Annotations: lo.Assign(i.Annotations, map[string]string{
v1beta1.NodePoolHashAnnotationKey: nodePool.Hash(),
v1beta1.NodePoolHashVersionAnnotationKey: v1beta1.NodePoolHashVersion,
}),
Labels: i.Labels,
OwnerReferences: []metav1.OwnerReference{
{
APIVersion: v1beta1.SchemeGroupVersion.String(),
Expand Down

0 comments on commit bee2bf9

Please sign in to comment.