feat: Drop the hostname requirement when handling PV topology (#1018)
jonathan-innis committed Mar 8, 2024
1 parent 7ab5eca commit 1b873b5
Showing 4 changed files with 259 additions and 8 deletions.
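For context, a hedged illustration (not taken from this commit) of the kind of PersistentVolume the change targets: a Local or HostPath volume whose node affinity pins it to a single node through the kubernetes.io/hostname label. The object and node names below are made up.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Illustrative local PV pinned to one node by a hostname affinity term.
	pv := &v1.PersistentVolume{
		ObjectMeta: metav1.ObjectMeta{Name: "local-pv-example"},
		Spec: v1.PersistentVolumeSpec{
			Capacity:    v1.ResourceList{v1.ResourceStorage: resource.MustParse("100Gi")},
			AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadWriteOnce},
			PersistentVolumeSource: v1.PersistentVolumeSource{
				Local: &v1.LocalVolumeSource{Path: "/mnt/local-disks"},
			},
			NodeAffinity: &v1.VolumeNodeAffinity{
				Required: &v1.NodeSelector{
					NodeSelectorTerms: []v1.NodeSelectorTerm{{
						MatchExpressions: []v1.NodeSelectorRequirement{{
							Key:      v1.LabelHostname,
							Operator: v1.NodeSelectorOpIn,
							Values:   []string{"node-being-disrupted"},
						}},
					}},
				},
			},
		},
	}
	// Before this commit, that hostname requirement flowed into the pod's scheduling
	// requirements and blocked replacement nodes; after it, the requirement is
	// ignored for Local and HostPath volumes.
	fmt.Println(pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions[0].Key)
}
```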
133 changes: 126 additions & 7 deletions pkg/controllers/disruption/suite_test.go
@@ -157,13 +157,12 @@ var _ = AfterEach(func() {

var _ = Describe("Simulate Scheduling", func() {
var nodePool *v1beta1.NodePool
var nodeClaims []*v1beta1.NodeClaim
var nodes []*v1.Node
var numNodes int
BeforeEach(func() {
numNodes = 10
nodePool = test.NodePool()
nodeClaims, nodes = test.NodeClaimsAndNodes(numNodes, v1beta1.NodeClaim{
})
It("should allow pods on deleting nodes to reschedule to uninitialized nodes", func() {
numNodes := 10
nodeClaims, nodes := test.NodeClaimsAndNodes(numNodes, v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"karpenter.sh/test-finalizer"},
Labels: map[string]string{
@@ -189,8 +188,6 @@ var _ = Describe("Simulate Scheduling", func() {
// inform cluster state about nodes and nodeclaims
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

})
It("should allow pods on deleting nodes to reschedule to uninitialized nodes", func() {
pod := test.Pod(test.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{
@@ -234,6 +231,33 @@ var _ = Describe("Simulate Scheduling", func() {
Expect(results.PodErrors[pod]).To(BeNil())
})
It("should allow multiple replace operations to happen successively", func() {
numNodes := 10
nodeClaims, nodes := test.NodeClaimsAndNodes(numNodes, v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"karpenter.sh/test-finalizer"},
Labels: map[string]string{
v1beta1.NodePoolLabelKey: nodePool.Name,
v1.LabelInstanceTypeStable: mostExpensiveInstance.Name,
v1beta1.CapacityTypeLabelKey: mostExpensiveOffering.CapacityType,
v1.LabelTopologyZone: mostExpensiveOffering.Zone,
},
},
Status: v1beta1.NodeClaimStatus{
Allocatable: map[v1.ResourceName]resource.Quantity{
v1.ResourceCPU: resource.MustParse("3"),
v1.ResourcePods: resource.MustParse("100"),
},
},
})
ExpectApplied(ctx, env.Client, nodePool)

for i := 0; i < numNodes; i++ {
ExpectApplied(ctx, env.Client, nodeClaims[i], nodes[i])
}

// inform cluster state about nodes and nodeclaims
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

// Create a pod for each node
pods := test.Pods(10, test.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
@@ -336,6 +360,101 @@ var _ = Describe("Simulate Scheduling", func() {
ncs = ExpectNodeClaims(ctx, env.Client)
Expect(len(ncs)).To(Equal(13))
})
It("can replace node with a local PV (ignoring hostname affinity)", func() {
nodeClaim, node := test.NodeClaimAndNode(v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
v1beta1.NodePoolLabelKey: nodePool.Name,
v1.LabelInstanceTypeStable: mostExpensiveInstance.Name,
v1beta1.CapacityTypeLabelKey: mostExpensiveOffering.CapacityType,
v1.LabelTopologyZone: mostExpensiveOffering.Zone,
},
},
Status: v1beta1.NodeClaimStatus{
Allocatable: map[v1.ResourceName]resource.Quantity{
v1.ResourceCPU: resource.MustParse("32"),
v1.ResourcePods: resource.MustParse("100"),
},
},
})
labels := map[string]string{
"app": "test",
}
// create our StatefulSet so we can link a pod to it
ss := test.StatefulSet()
ExpectApplied(ctx, env.Client, ss)

// StorageClass that references "no-provisioner" and is used for local volume storage
storageClass := test.StorageClass(test.StorageClassOptions{
ObjectMeta: metav1.ObjectMeta{
Name: "local-path",
},
Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"),
})
persistentVolume := test.PersistentVolume(test.PersistentVolumeOptions{UseLocal: true})
persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
// This PV is only valid for use against this node
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: v1.LabelHostname,
Operator: v1.NodeSelectorOpIn,
Values: []string{node.Name},
},
},
},
},
},
}
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name})
pod := test.Pod(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{Labels: labels,
OwnerReferences: []metav1.OwnerReference{
{
APIVersion: "apps/v1",
Kind: "StatefulSet",
Name: ss.Name,
UID: ss.UID,
Controller: ptr.Bool(true),
BlockOwnerDeletion: ptr.Bool(true),
},
},
},
PersistentVolumeClaims: []string{persistentVolumeClaim.Name},
})
ExpectApplied(ctx, env.Client, ss, pod, nodeClaim, node, nodePool, storageClass, persistentVolume, persistentVolumeClaim)

// bind pods to node
ExpectManualBinding(ctx, env.Client, pod, node)

// inform cluster state about nodes and nodeclaims
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim})

// disruption won't delete the old node until the new node is ready
var wg sync.WaitGroup
ExpectTriggerVerifyAction(&wg)
ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1)
ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{})
wg.Wait()

// Process the item so that the nodes can be deleted.
ExpectReconcileSucceeded(ctx, queue, types.NamespacedName{})
// Cascade any deletion of the nodeClaim to the node
ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim)

// Expect that the new nodeClaim was created, and it's different than the original
// We should succeed in getting a replacement, since we assume that the node affinity requirement will be invalid
// once we spin down the old node
ExpectNotFound(ctx, env.Client, nodeClaim, node)
nodeclaims := ExpectNodeClaims(ctx, env.Client)
nodes := ExpectNodes(ctx, env.Client)
Expect(nodeclaims).To(HaveLen(1))
Expect(nodes).To(HaveLen(1))
Expect(nodeclaims[0].Name).ToNot(Equal(nodeClaim.Name))
Expect(nodes[0].Name).ToNot(Equal(node.Name))
})
})

var _ = Describe("Disruption Taints", func() {
9 changes: 8 additions & 1 deletion pkg/controllers/provisioning/scheduling/volumetopology.go
@@ -134,7 +134,14 @@ func (v *VolumeTopology) getPersistentVolumeRequirements(ctx context.Context, po
var requirements []v1.NodeSelectorRequirement
if len(pv.Spec.NodeAffinity.Required.NodeSelectorTerms) > 0 {
// Terms are ORed, only use the first term
requirements = append(requirements, pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions...)
requirements = pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions
// If we are using a Local volume or a HostPath volume, then we should ignore the Hostname affinity
// on it because re-scheduling this pod to a new node means not using the same Hostname affinity that we currently have
if pv.Spec.Local != nil || pv.Spec.HostPath != nil {
requirements = lo.Reject(requirements, func(n v1.NodeSelectorRequirement, _ int) bool {
return n.Key == v1.LabelHostname
})
}
}
return requirements, nil
}
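For reference, a minimal self-contained sketch of the filtering idea the hunk above adds, assuming the samber/lo and k8s.io/api modules. The helper name pvRequirements is made up and only mirrors the shape of getPersistentVolumeRequirements; it is not the production function.

```go
package main

import (
	"fmt"

	"github.com/samber/lo"
	v1 "k8s.io/api/core/v1"
)

// pvRequirements extracts node-selector requirements from a PV's node affinity,
// dropping the hostname requirement for Local and HostPath volumes, since that
// hostname becomes meaningless once the original node is replaced.
func pvRequirements(pv *v1.PersistentVolume) []v1.NodeSelectorRequirement {
	if pv.Spec.NodeAffinity == nil || pv.Spec.NodeAffinity.Required == nil ||
		len(pv.Spec.NodeAffinity.Required.NodeSelectorTerms) == 0 {
		return nil
	}
	// Terms are ORed, only use the first term
	requirements := pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions
	if pv.Spec.Local != nil || pv.Spec.HostPath != nil {
		requirements = lo.Reject(requirements, func(n v1.NodeSelectorRequirement, _ int) bool {
			return n.Key == v1.LabelHostname
		})
	}
	return requirements
}

func main() {
	pv := &v1.PersistentVolume{
		Spec: v1.PersistentVolumeSpec{
			PersistentVolumeSource: v1.PersistentVolumeSource{
				Local: &v1.LocalVolumeSource{Path: "/mnt/local-disks"},
			},
			NodeAffinity: &v1.VolumeNodeAffinity{
				Required: &v1.NodeSelector{
					NodeSelectorTerms: []v1.NodeSelectorTerm{{
						MatchExpressions: []v1.NodeSelectorRequirement{
							{Key: v1.LabelHostname, Operator: v1.NodeSelectorOpIn, Values: []string{"node-a"}},
							{Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{"zone-1"}},
						},
					}},
				},
			},
		},
	}
	// Only the zone requirement survives; the hostname requirement is rejected.
	fmt.Println(pvRequirements(pv))
}
```

A PV backed by a regular CSI driver takes neither branch of the Spec.Local/Spec.HostPath check, so its hostname affinity is preserved, which is what the new "should not ignore hostname affinity when using non-local path volumes" test below asserts.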
111 changes: 111 additions & 0 deletions pkg/controllers/provisioning/suite_test.go
@@ -1540,6 +1540,117 @@ var _ = Describe("Provisioning", func() {
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, "test-zone-3"))
})
DescribeTable("should ignore hostname affinity scheduling when using local path volumes",
func(volumeOptions test.PersistentVolumeOptions) {
// StorageClass that references "no-provisioner" and is used for local volume storage
storageClass = test.StorageClass(test.StorageClassOptions{
ObjectMeta: metav1.ObjectMeta{
Name: "local-path",
},
Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"),
})
// Create a PersistentVolume that is using a random node name for its affinity
persistentVolume := test.PersistentVolume(volumeOptions)
persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: v1.LabelHostname,
Operator: v1.NodeSelectorOpIn,
Values: []string{test.RandomName()},
},
},
},
},
},
}
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name})
ExpectApplied(ctx, env.Client, test.NodePool(), storageClass, persistentVolumeClaim, persistentVolume)
pod := test.UnschedulablePod(test.PodOptions{
PersistentVolumeClaims: []string{persistentVolumeClaim.Name},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
// Expect that we are still able to schedule this pod to a node, even though we had a hostname affinity on it
ExpectScheduled(ctx, env.Client, pod)
},
Entry("when using local volumes", test.PersistentVolumeOptions{UseLocal: true}),
Entry("when using hostpath volumes", test.PersistentVolumeOptions{UseHostPath: true}),
)
DescribeTable("should ignore hostname affinity scheduling when using local path volumes (ephemeral volume)",
func(volumeOptions test.PersistentVolumeOptions) {
// StorageClass that references "no-provisioner" and is used for local volume storage
storageClass = test.StorageClass(test.StorageClassOptions{
ObjectMeta: metav1.ObjectMeta{
Name: "local-path",
},
Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"),
})
pod := test.UnschedulablePod(test.PodOptions{
EphemeralVolumeTemplates: []test.EphemeralVolumeTemplateOptions{
{
StorageClassName: &storageClass.Name,
},
},
})
persistentVolume := test.PersistentVolume(volumeOptions)
persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: v1.LabelHostname,
Operator: v1.NodeSelectorOpIn,
Values: []string{test.RandomName()},
},
},
},
},
},
}
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("%s-%s", pod.Name, pod.Spec.Volumes[0].Name),
},
VolumeName: persistentVolume.Name,
StorageClassName: &storageClass.Name,
})
ExpectApplied(ctx, env.Client, test.NodePool(), storageClass, pod, persistentVolumeClaim, persistentVolume)
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectScheduled(ctx, env.Client, pod)
},
Entry("when using local volumes", test.PersistentVolumeOptions{UseLocal: true}),
Entry("when using hostpath volumes", test.PersistentVolumeOptions{UseHostPath: true}),
)
It("should not ignore hostname affinity when using non-local path volumes", func() {
// This PersistentVolume is going to use a standard CSI volume for provisioning
persistentVolume := test.PersistentVolume()
persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: v1.LabelHostname,
Operator: v1.NodeSelectorOpIn,
Values: []string{test.RandomName()},
},
},
},
},
},
}
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name})
ExpectApplied(ctx, env.Client, test.NodePool(), storageClass, persistentVolumeClaim, persistentVolume)
pod := test.UnschedulablePod(test.PodOptions{
PersistentVolumeClaims: []string{persistentVolumeClaim.Name},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
// Expect that this pod can't schedule because we have a hostname affinity, and we don't currently have a node that we can schedule to
ExpectNotScheduled(ctx, env.Client, pod)
})
It("should not schedule if volume zones are incompatible", func() {
persistentVolume := test.PersistentVolume(test.PersistentVolumeOptions{Zones: []string{"test-zone-3"}})
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name})
14 changes: 14 additions & 0 deletions pkg/test/storage.go
@@ -34,6 +34,8 @@ type PersistentVolumeOptions struct {
StorageClassName string
Driver string
UseAWSInTreeDriver bool
UseLocal bool
UseHostPath bool
}

func PersistentVolume(overrides ...PersistentVolumeOptions) *v1.PersistentVolume {
@@ -46,6 +48,18 @@ func PersistentVolume(overrides ...PersistentVolumeOptions) *v1.PersistentVolume
// Determine the PersistentVolumeSource based on the options
var source v1.PersistentVolumeSource
switch {
case options.UseLocal:
source = v1.PersistentVolumeSource{
Local: &v1.LocalVolumeSource{
Path: "/mnt/local-disks",
},
}
case options.UseHostPath:
source = v1.PersistentVolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: "/mnt/local-disks",
},
}
case options.UseAWSInTreeDriver:
source = v1.PersistentVolumeSource{
AWSElasticBlockStore: &v1.AWSElasticBlockStoreVolumeSource{
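A hedged usage sketch of the new UseLocal and UseHostPath options, mirroring how the tests above construct their volumes; the sigs.k8s.io/karpenter/pkg/test import path is an assumption about this module.

```go
package main

import (
	"fmt"

	"sigs.k8s.io/karpenter/pkg/test"
)

func main() {
	// Both helpers back the volume with a path under /mnt/local-disks; only the
	// PersistentVolumeSource type differs.
	localPV := test.PersistentVolume(test.PersistentVolumeOptions{UseLocal: true})
	hostPathPV := test.PersistentVolume(test.PersistentVolumeOptions{UseHostPath: true})

	fmt.Println(localPV.Spec.Local != nil)       // true
	fmt.Println(hostPathPV.Spec.HostPath != nil) // true
}
```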
