
Commit

chore: remove various alpha code references (#780)
njtran committed Nov 11, 2023
1 parent 7b5ae9c commit a9b77e7
Showing 23 changed files with 347 additions and 703 deletions.
7 changes: 3 additions & 4 deletions Makefile
@@ -3,21 +3,20 @@ help: ## Display help

presubmit: verify test licenses vulncheck ## Run all steps required for code to be checked in

## TODO @joinnis: Reduce the gingko timeout back to 10m once the v1alpha5 testing is removed
test: ## Run tests
go test ./... \
-race \
-timeout 20m \
-timeout 10m \
--ginkgo.focus="${FOCUS}" \
--ginkgo.timeout=20m \
--ginkgo.timeout=10m \
--ginkgo.v \
-cover -coverprofile=coverage.out -outputdir=. -coverpkg=./...

deflake: ## Run randomized, racing tests until the test fails to catch flakes
ginkgo \
--race \
--focus="${FOCUS}" \
--timeout=20m \
--timeout=10m \
--randomize-all \
--until-it-fails \
-v \
7 changes: 3 additions & 4 deletions pkg/cloudprovider/fake/cloudprovider.go
@@ -27,7 +27,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/apis/v1beta1"
"github.com/aws/karpenter-core/pkg/cloudprovider"
"github.com/aws/karpenter-core/pkg/scheduling"
@@ -93,9 +92,9 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *v1beta1.NodeClaim
return &v1beta1.NodeClaim{}, fmt.Errorf("erroring as number of AllowedCreateCalls has been exceeded")
}
reqs := scheduling.NewNodeSelectorRequirements(nodeClaim.Spec.Requirements...)
np := &v1beta1.NodePool{ObjectMeta: metav1.ObjectMeta{Name: nodeClaim.Labels[lo.Ternary(nodeClaim.IsMachine, v1alpha5.ProvisionerNameLabelKey, v1beta1.NodePoolLabelKey)]}}
np := &v1beta1.NodePool{ObjectMeta: metav1.ObjectMeta{Name: nodeClaim.Labels[v1beta1.NodePoolLabelKey]}}
instanceTypes := lo.Filter(lo.Must(c.GetInstanceTypes(ctx, np)), func(i *cloudprovider.InstanceType, _ int) bool {
return reqs.Compatible(i.Requirements, scheduling.AllowUndefinedWellKnownLabelsV1Alpha5) == nil &&
return reqs.Compatible(i.Requirements, scheduling.AllowUndefinedWellKnownLabels) == nil &&
len(i.Offerings.Requirements(reqs).Available()) > 0 &&
resources.Fits(nodeClaim.Spec.Resources.Requests, i.Allocatable())
})
@@ -118,7 +117,7 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *v1beta1.NodeClaim
if reqs.Compatible(scheduling.NewRequirements(
scheduling.NewRequirement(v1.LabelTopologyZone, v1.NodeSelectorOpIn, o.Zone),
scheduling.NewRequirement(v1beta1.CapacityTypeLabelKey, v1.NodeSelectorOpIn, o.CapacityType),
), scheduling.AllowUndefinedWellKnownLabelsV1Alpha5) == nil {
), scheduling.AllowUndefinedWellKnownLabels) == nil {
labels[v1.LabelTopologyZone] = o.Zone
labels[v1beta1.CapacityTypeLabelKey] = o.CapacityType
break
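With the v1alpha5 Provisioner fallback removed, the fake cloud provider above resolves the owning NodePool purely from the v1beta1.NodePoolLabelKey label on the NodeClaim. A minimal sketch of that lookup as a standalone helper, assuming karpenter-core's v1beta1 types and a controller-runtime client (the package, helper name, and error wording are illustrative, not code from this commit):

package nodepoolutil

import (
	"context"
	"fmt"

	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/aws/karpenter-core/pkg/apis/v1beta1"
)

// NodePoolForNodeClaim looks up the NodePool that owns a NodeClaim. There is
// no longer a v1alpha5.ProvisionerNameLabelKey fallback to consult.
func NodePoolForNodeClaim(ctx context.Context, kubeClient client.Client, nodeClaim *v1beta1.NodeClaim) (*v1beta1.NodePool, error) {
	name, ok := nodeClaim.Labels[v1beta1.NodePoolLabelKey]
	if !ok {
		return nil, fmt.Errorf("nodeclaim %q is missing the %s label", nodeClaim.Name, v1beta1.NodePoolLabelKey)
	}
	nodePool := &v1beta1.NodePool{}
	if err := kubeClient.Get(ctx, types.NamespacedName{Name: name}, nodePool); err != nil {
		return nil, fmt.Errorf("getting nodepool %q, %w", name, err)
	}
	return nodePool, nil
}
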
5 changes: 0 additions & 5 deletions pkg/cloudprovider/fake/instancetype.go
@@ -39,11 +39,6 @@ const (
)

func init() {
v1alpha5.WellKnownLabels.Insert(
LabelInstanceSize,
ExoticInstanceLabelKey,
IntegerInstanceLabelKey,
)
v1beta1.WellKnownLabels.Insert(
LabelInstanceSize,
ExoticInstanceLabelKey,
20 changes: 10 additions & 10 deletions pkg/controllers/disruption/consolidation_test.go
@@ -795,7 +795,7 @@ var _ = Describe("Consolidation", func() {
},
},
})
// Block this pod from being disrupted with karpenter.sh/do-not-evict
// Block this pod from being disrupted with karpenter.sh/do-not-disrupt
pods[2].Annotations = lo.Assign(pods[2].Annotations, map[string]string{v1beta1.DoNotDisruptAnnotationKey: "true"})

ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], nodePool)
@@ -2345,21 +2345,21 @@ var _ = Describe("Consolidation", func() {
break
}
}
doNotEvictPod := test.Pod(test.PodOptions{
doNotDisruptPod := test.Pod(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
v1beta1.DoNotDisruptAnnotationKey: "true",
},
},
})
ExpectApplied(ctx, env.Client, doNotEvictPod)
ExpectManualBinding(ctx, env.Client, doNotEvictPod, node)
ExpectApplied(ctx, env.Client, doNotDisruptPod)
ExpectManualBinding(ctx, env.Client, doNotDisruptPod, node)

// Step forward to satisfy the validation timeout and wait for the reconcile to finish
ExpectTriggerVerifyAction(&wg)
wg.Wait()

// we would normally be able to replace a node, but we are blocked by the do-not-evict pods during validation
// we would normally be able to replace a node, but we are blocked by the do-not-disrupt pods during validation
Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1))
Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1))
ExpectExists(ctx, env.Client, node)
@@ -2498,22 +2498,22 @@ var _ = Describe("Consolidation", func() {
break
}
}
doNotEvictPods := test.Pods(2, test.PodOptions{
doNotDisruptPods := test.Pods(2, test.PodOptions{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
v1beta1.DoNotDisruptAnnotationKey: "true",
},
},
})
ExpectApplied(ctx, env.Client, doNotEvictPods[0], doNotEvictPods[1])
ExpectManualBinding(ctx, env.Client, doNotEvictPods[0], node)
ExpectManualBinding(ctx, env.Client, doNotEvictPods[1], node2)
ExpectApplied(ctx, env.Client, doNotDisruptPods[0], doNotDisruptPods[1])
ExpectManualBinding(ctx, env.Client, doNotDisruptPods[0], node)
ExpectManualBinding(ctx, env.Client, doNotDisruptPods[1], node2)

// Step forward to satisfy the validation timeout and wait for the reconcile to finish
ExpectTriggerVerifyAction(&wg)
wg.Wait()

// we would normally be able to consolidate down to a single node, but we are blocked by the do-not-evict pods during validation
// we would normally be able to consolidate down to a single node, but we are blocked by the do-not-disrupt pods during validation
Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(2))
Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(2))
ExpectExists(ctx, env.Client, node)
7 changes: 0 additions & 7 deletions pkg/controllers/disruption/controller.go
@@ -148,7 +148,6 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc
}

func (c *Controller) disrupt(ctx context.Context, disruption Method) (bool, error) {
defer metrics.Measure(deprovisioningDurationHistogram.WithLabelValues(disruption.Type()))()
defer metrics.Measure(disruptionEvaluationDurationHistogram.With(map[string]string{
methodLabel: disruption.Type(),
consolidationTypeLabel: disruption.ConsolidationType(),
@@ -180,10 +179,6 @@ func (c *Controller) disrupt(ctx context.Context, disruption Method) (bool, erro
}

func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command) error {
deprovisioningActionsPerformedCounter.With(map[string]string{
actionLabel: fmt.Sprintf("%s/%s", m.Type(), cmd.Action()),
deprovisionerLabel: m.Type(),
}).Inc()
disruptionActionsPerformedCounter.With(map[string]string{
actionLabel: string(cmd.Action()),
methodLabel: m.Type(),
@@ -224,7 +219,6 @@ func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command)
// nolint:gocyclo
func (c *Controller) launchReplacementNodeClaims(ctx context.Context, m Method, cmd Command) error {
reason := fmt.Sprintf("%s/%s", m.Type(), cmd.Action())
defer metrics.Measure(deprovisioningReplacementNodeInitializedHistogram)()
defer metrics.Measure(disruptionReplacementNodeClaimInitializedHistogram)()

stateNodes := lo.Map(cmd.candidates, func(c *Candidate, _ int) *state.StateNode { return c.StateNode })
@@ -254,7 +248,6 @@ func (c *Controller) launchReplacementNodeClaims(ctx context.Context, m Method,
// NodeClaim never became ready or the NodeClaims that we tried to launch got Insufficient Capacity or some
// other transient error
if err := c.waitForReadiness(ctx, nodeClaimKeys[i], reason); err != nil {
deprovisioningReplacementNodeLaunchFailedCounter.WithLabelValues(reason).Inc()
disruptionReplacementNodeClaimFailedCounter.With(map[string]string{
methodLabel: m.Type(),
consolidationTypeLabel: m.ConsolidationType(),
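The defer metrics.Measure(...)() calls retained above time the surrounding function and record the elapsed seconds on a Prometheus histogram when the deferred closure runs; the removed deprovisioning histogram measured the same span under the old naming. A minimal sketch of how such a helper can be written (a generic observe-on-return pattern, assumed for illustration rather than copied from Karpenter's pkg/metrics):

package metrics

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Measure starts a timer and returns a closure that, when invoked (typically
// via defer), observes the elapsed seconds on the supplied observer.
func Measure(observer prometheus.Observer) func() {
	start := time.Now()
	return func() {
		observer.Observe(time.Since(start).Seconds())
	}
}

Deferring the returned closure, as in defer metrics.Measure(disruptionEvaluationDurationHistogram.With(labels))(), makes the whole reconciliation pass the measured unit.
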
5 changes: 1 addition & 4 deletions pkg/controllers/disruption/drift.go
@@ -20,7 +20,6 @@ import (
"fmt"
"sort"

"knative.dev/pkg/logging"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/samber/lo"
@@ -76,7 +75,6 @@ func (d *Drift) ComputeCommand(ctx context.Context, candidates ...*Candidate) (C
if err != nil {
return Command{}, err
}
deprovisioningEligibleMachinesGauge.WithLabelValues(d.Type()).Set(float64(len(candidates)))
disruptionEligibleNodesGauge.With(map[string]string{
methodLabel: d.Type(),
consolidationTypeLabel: d.ConsolidationType(),
@@ -101,9 +99,8 @@ }
}
return Command{}, err
}
// Log when all pods can't schedule, as the command will get executed immediately.
// Emit an event that we couldn't reschedule the pods on the node.
if !results.AllNonPendingPodsScheduled() {
logging.FromContext(ctx).With(lo.Ternary(candidate.NodeClaim.IsMachine, "machine", "nodeclaim"), candidate.NodeClaim.Name, "node", candidate.Node.Name).Debugf("cannot terminate since scheduling simulation failed to schedule all pods %s", results.NonPendingPodSchedulingErrors())
d.recorder.Publish(disruptionevents.Blocked(candidate.Node, candidate.NodeClaim, "Scheduling simulation failed to schedule all pods")...)
continue
}
45 changes: 45 additions & 0 deletions pkg/controllers/disruption/drift_test.go
@@ -28,6 +28,7 @@ import (
"knative.dev/pkg/apis"
"knative.dev/pkg/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/apis/v1beta1"
@@ -348,6 +349,50 @@ var _ = Describe("Drift", func() {
Expect(nodeclaims[0].Name).ToNot(Equal(nodeClaim.Name))
Expect(nodes[0].Name).ToNot(Equal(node.Name))
})
It("should untaint nodes when drift replacement fails", func() {
cloudProvider.AllowedCreateCalls = 0 // fail the replacement and expect it to untaint

labels := map[string]string{
"app": "test",
}
// create our RS so we can link a pod to it
rs := test.ReplicaSet()
ExpectApplied(ctx, env.Client, rs)
Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed())

pod := test.Pod(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{Labels: labels,
OwnerReferences: []metav1.OwnerReference{
{
APIVersion: "apps/v1",
Kind: "ReplicaSet",
Name: rs.Name,
UID: rs.UID,
Controller: ptr.Bool(true),
BlockOwnerDeletion: ptr.Bool(true),
},
},
},
})
ExpectApplied(ctx, env.Client, rs, nodeClaim, node, nodePool, pod)

// bind pods to node
ExpectManualBinding(ctx, env.Client, pod, node)

// inform cluster state about nodes and nodeclaims
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim})

var wg sync.WaitGroup
ExpectTriggerVerifyAction(&wg)
ExpectNewNodeClaimsDeleted(ctx, env.Client, &wg, 1)
_, err := disruptionController.Reconcile(ctx, reconcile.Request{})
Expect(err).To(HaveOccurred())
wg.Wait()

// We should have tried to create a new nodeClaim but failed to do so; therefore, we untainted the existing node
node = ExpectExists(ctx, env.Client, node)
Expect(node.Spec.Taints).ToNot(ContainElement(v1beta1.DisruptionNoScheduleTaint))
})
It("can replace drifted nodes with multiple nodes", func() {
currentInstance := fake.NewInstanceType(fake.InstanceTypeOptions{
Name: "current-on-demand",
1 change: 0 additions & 1 deletion pkg/controllers/disruption/emptiness.go
@@ -50,7 +50,6 @@ func (e *Emptiness) ComputeCommand(_ context.Context, candidates ...*Candidate)
emptyCandidates := lo.Filter(candidates, func(cn *Candidate, _ int) bool {
return cn.NodeClaim.DeletionTimestamp.IsZero() && len(cn.pods) == 0
})
deprovisioningEligibleMachinesGauge.WithLabelValues(e.Type()).Set(float64(len(candidates)))
disruptionEligibleNodesGauge.With(map[string]string{
methodLabel: e.Type(),
consolidationTypeLabel: e.ConsolidationType(),
1 change: 0 additions & 1 deletion pkg/controllers/disruption/emptynodeconsolidation.go
@@ -50,7 +50,6 @@ func (c *EmptyNodeConsolidation) ComputeCommand(ctx context.Context, candidates
if err != nil {
return Command{}, fmt.Errorf("sorting candidates, %w", err)
}
deprovisioningEligibleMachinesGauge.WithLabelValues(c.Type()).Set(float64(len(candidates)))
disruptionEligibleNodesGauge.With(map[string]string{
methodLabel: c.Type(),
consolidationTypeLabel: c.ConsolidationType(),
4 changes: 1 addition & 3 deletions pkg/controllers/disruption/expiration.go
@@ -80,7 +80,6 @@ func (e *Expiration) ComputeCommand(ctx context.Context, candidates ...*Candidat
if err != nil {
return Command{}, fmt.Errorf("filtering candidates, %w", err)
}
deprovisioningEligibleMachinesGauge.WithLabelValues(e.Type()).Set(float64(len(candidates)))
disruptionEligibleNodesGauge.With(map[string]string{
methodLabel: e.Type(),
consolidationTypeLabel: e.ConsolidationType(),
@@ -105,9 +104,8 @@ }
}
return Command{}, err
}
// Log when all pods can't schedule, as the command will get executed immediately.
// Emit an event that we couldn't reschedule the pods on the node.
if !results.AllNonPendingPodsScheduled() {
logging.FromContext(ctx).With(lo.Ternary(candidate.NodeClaim.IsMachine, "machine", "nodeclaim"), candidate.NodeClaim.Name, "node", candidate.Node.Name).Debugf("cannot terminate since scheduling simulation failed to schedule all pods, %s", results.NonPendingPodSchedulingErrors())
e.recorder.Publish(disruptionevents.Blocked(candidate.Node, candidate.NodeClaim, "Scheduling simulation failed to schedule all pods")...)
continue
}
8 changes: 4 additions & 4 deletions pkg/controllers/disruption/expiration_test.go
@@ -396,8 +396,8 @@ var _ = Describe("Expiration", func() {
Expect(nodeclaims[0].Name).ToNot(Equal(nodeClaim.Name))
Expect(nodes[0].Name).ToNot(Equal(node.Name))
})
It("should uncordon nodes when expiration replacement fails", func() {
cloudProvider.AllowedCreateCalls = 0 // fail the replacement and expect it to uncordon
It("should untaint nodes when expiration replacement fails", func() {
cloudProvider.AllowedCreateCalls = 0 // fail the replacement and expect it to untaint

labels := map[string]string{
"app": "test",
@@ -436,9 +436,9 @@ var _ = Describe("Expiration", func() {
Expect(err).To(HaveOccurred())
wg.Wait()

// We should have tried to create a new nodeClaim but failed to do so; therefore, we uncordoned the existing node
// We should have tried to create a new nodeClaim but failed to do so; therefore, we untainted the existing node
node = ExpectExists(ctx, env.Client, node)
Expect(node.Spec.Unschedulable).To(BeFalse())
Expect(node.Spec.Taints).ToNot(ContainElement(v1beta1.DisruptionNoScheduleTaint))
})
It("can replace node for expiration with multiple nodes", func() {
currentInstance := fake.NewInstanceType(fake.InstanceTypeOptions{
67 changes: 1 addition & 66 deletions pkg/controllers/disruption/metrics.go
@@ -22,80 +22,15 @@ import (
)

func init() {
crmetrics.Registry.MustRegister(deprovisioningDurationHistogram, deprovisioningReplacementNodeInitializedHistogram, deprovisioningActionsPerformedCounter,
deprovisioningEligibleMachinesGauge, deprovisioningReplacementNodeLaunchFailedCounter, deprovisioningConsolidationTimeoutsCounter,
disruptionEvaluationDurationHistogram, disruptionReplacementNodeClaimInitializedHistogram, disruptionReplacementNodeClaimFailedCounter,
crmetrics.Registry.MustRegister(disruptionEvaluationDurationHistogram, disruptionReplacementNodeClaimInitializedHistogram, disruptionReplacementNodeClaimFailedCounter,
disruptionActionsPerformedCounter, disruptionEligibleNodesGauge, disruptionConsolidationTimeoutTotalCounter)
}

const (
deprovisioningSubsystem = "deprovisioning"
deprovisionerLabel = "deprovisioner"

disruptionSubsystem = "disruption"
actionLabel = "action"
methodLabel = "method"
consolidationTypeLabel = "consolidation_type"

multiMachineConsolidationLabelValue = "multi-machine"
singleMachineConsolidationLabelValue = "single-machine"
)

var (
deprovisioningDurationHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metrics.Namespace,
Subsystem: deprovisioningSubsystem,
Name: "evaluation_duration_seconds",
Help: "Duration of the deprovisioning evaluation process in seconds.",
Buckets: metrics.DurationBuckets(),
},
[]string{"method"},
)
deprovisioningReplacementNodeInitializedHistogram = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: metrics.Namespace,
Subsystem: deprovisioningSubsystem,
Name: "replacement_machine_initialized_seconds",
Help: "Amount of time required for a replacement machine to become initialized.",
Buckets: metrics.DurationBuckets(),
})
deprovisioningActionsPerformedCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: deprovisioningSubsystem,
Name: "actions_performed",
Help: "Number of deprovisioning actions performed. Labeled by deprovisioner.",
},
[]string{actionLabel, deprovisionerLabel},
)
deprovisioningEligibleMachinesGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: deprovisioningSubsystem,
Name: "eligible_machines",
Help: "Number of machines eligible for deprovisioning by Karpenter. Labeled by deprovisioner",
},
[]string{deprovisionerLabel},
)
deprovisioningConsolidationTimeoutsCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: deprovisioningSubsystem,
Name: "consolidation_timeouts",
Help: "Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type.",
},
[]string{consolidationTypeLabel},
)
deprovisioningReplacementNodeLaunchFailedCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: deprovisioningSubsystem,
Name: "replacement_machine_launch_failure_counter",
Help: "The number of times that Karpenter failed to launch a replacement node for deprovisioning. Labeled by deprovisioner.",
},
[]string{deprovisionerLabel},
)
)

var (
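Only the disruption-prefixed metrics survive this change, and they follow the same define-then-register shape as the removed deprovisioning block. A condensed, hypothetical sketch of that shape (the metric name and buckets are assumptions based on the removed definitions; the label constants mirror the ones kept above):

package disruption

import (
	"github.com/prometheus/client_golang/prometheus"
	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"

	"github.com/aws/karpenter-core/pkg/metrics"
)

const (
	disruptionSubsystem    = "disruption"
	methodLabel            = "method"
	consolidationTypeLabel = "consolidation_type"
)

// disruptionEvaluationDurationHistogram times each disruption method's
// evaluation pass, labeled by method and consolidation type.
var disruptionEvaluationDurationHistogram = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: metrics.Namespace, // assumed to resolve to "karpenter"
		Subsystem: disruptionSubsystem,
		Name:      "evaluation_duration_seconds",
		Help:      "Duration of the disruption evaluation process in seconds. Labeled by method and consolidation type.",
		Buckets:   metrics.DurationBuckets(),
	},
	[]string{methodLabel, consolidationTypeLabel},
)

func init() {
	// Register with the controller-runtime registry so the metric is exposed
	// on the manager's /metrics endpoint.
	crmetrics.Registry.MustRegister(disruptionEvaluationDurationHistogram)
}
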
