Add tests for ignoring scheduler processing #121783

Merged: 4 commits, merged Jan 17, 2024

Changes from 2 commits
60 changes: 56 additions & 4 deletions test/e2e/autoscaling/cluster_size_autoscaling.go
@@ -92,6 +92,8 @@ const (
highPriorityClassName = "high-priority"

gpuLabel = "cloud.google.com/gke-accelerator"

nonExistingBypassedSchedulerName = "non-existing-bypassed-scheduler"
)

var _ = SIGDescribe("Cluster size autoscaling", framework.WithSlow(), func() {
@@ -1000,6 +1002,49 @@ var _ = SIGDescribe("Cluster size autoscaling", framework.WithSlow(), func() {
framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet,
func(size int) bool { return size == increasedSize }, time.Second))
})

f.It("should scale up when unprocessed pod is created and is going to be unschedulable", feature.ClusterScaleUpBypassScheduler, func(ctx context.Context) {
// 70% of allocatable memory of a single node * replica count, forcing a scale up in case of normal pods
replicaCount := 2 * nodeCount
reservedMemory := int(float64(replicaCount) * float64(0.7) * float64(memAllocatableMb))
ginkgo.DeferCleanup(ReserveMemoryWithSchedulerName(ctx, f, "memory-reservation", replicaCount, reservedMemory, false, 1, nonExistingBypassedSchedulerName))
// Verify that cluster size is increased
ginkgo.By("Waiting for cluster scale-up")
sizeFunc := func(size int) bool {
// Softly checks scale-up since other types of machines can be added which would affect #nodes
return size > nodeCount
}
framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(ctx, f.ClientSet, sizeFunc, scaleUpTimeout, 0))
})
f.It("shouldn't scale up when unprocessed pod is created and is going to be schedulable", feature.ClusterScaleUpBypassScheduler, func(ctx context.Context) {

Review comment (Contributor): This test should have been marked as "slow" because it blocks for 5 minutes (= scaleUpTimeout).

// 50% of allocatable memory of a single node, so that no scale up would trigger in normal cases
replicaCount := 1
reservedMemory := int(float64(0.5) * float64(memAllocatableMb))
ginkgo.DeferCleanup(ReserveMemoryWithSchedulerName(ctx, f, "memory-reservation", replicaCount, reservedMemory, false, 1, nonExistingBypassedSchedulerName))
// Verify that cluster size is the same
ginkgo.By(fmt.Sprintf("Waiting for scale up hoping it won't happen, polling cluster size for %s", scaleUpTimeout.String()))
sizeFunc := func(size int) bool {
return size == nodeCount
}
gomega.Consistently(ctx, func() error {

Review comment (Member): Consistently or Eventually? https://onsi.github.io/gomega/#eventually

return WaitForClusterSizeFunc(ctx, f.ClientSet, sizeFunc, time.Second)
}).WithTimeout(scaleUpTimeout).WithPolling(framework.Poll).ShouldNot(gomega.HaveOccurred())
})
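
A minimal sketch of the earlier review suggestion to mark this test slow, not part of this diff, assuming framework.WithSlow() can decorate f.It the same way it already decorates the SIGDescribe container at the top of this file:

f.It("shouldn't scale up when unprocessed pod is created and is going to be schedulable",
	feature.ClusterScaleUpBypassScheduler, framework.WithSlow(),
	func(ctx context.Context) {
		// test body unchanged: reserve 50% of a node's allocatable memory with the
		// bypassed scheduler name and assert the cluster size stays at nodeCount
		// for the full scaleUpTimeout window
	})
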
f.It("shouldn't scale up when unprocessed pod is created and scheduler is not specified to be bypassed", feature.ClusterScaleUpBypassScheduler, func(ctx context.Context) {
// 70% of allocatable memory of a single node * replica count, forcing a scale up in case of normal pods
replicaCount := 2 * nodeCount
reservedMemory := int(float64(replicaCount) * float64(0.7) * float64(memAllocatableMb))
schedulerName := "non-existent-scheduler-" + f.UniqueName
ginkgo.DeferCleanup(ReserveMemoryWithSchedulerName(ctx, f, "memory-reservation", replicaCount, reservedMemory, false, 1, schedulerName))
// Verify that cluster size is the same
ginkgo.By(fmt.Sprintf("Waiting for scale up hoping it won't happen, polling cluster size for %s", scaleUpTimeout.String()))
sizeFunc := func(size int) bool {
return size == nodeCount
}
gomega.Consistently(ctx, func() error {
return WaitForClusterSizeFunc(ctx, f.ClientSet, sizeFunc, time.Second)
}).WithTimeout(scaleUpTimeout).WithPolling(framework.Poll).ShouldNot(gomega.HaveOccurred())
})
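
On the Consistently-vs-Eventually question raised above: both negative tests need the size check to keep holding for the whole window, which is what Consistently asserts; Eventually only waits for the check to pass once. A short sketch of the difference, reusing the same gomega calls as the tests:

// Consistently fails as soon as the polled function returns an error at any
// point before scaleUpTimeout elapses, i.e. the cluster size must stay == nodeCount.
gomega.Consistently(ctx, func() error {
	return WaitForClusterSizeFunc(ctx, f.ClientSet, sizeFunc, time.Second)
}).WithTimeout(scaleUpTimeout).WithPolling(framework.Poll).ShouldNot(gomega.HaveOccurred())

// Eventually would succeed the first time the function returns nil, so an
// unwanted scale-up later in the window would go unnoticed.
gomega.Eventually(ctx, func() error {
	return WaitForClusterSizeFunc(ctx, f.ClientSet, sizeFunc, time.Second)
}).WithTimeout(scaleUpTimeout).WithPolling(framework.Poll).Should(gomega.Succeed())
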
})

func installNvidiaDriversDaemonSet(ctx context.Context, f *framework.Framework) {
@@ -1298,7 +1343,7 @@ func getPoolSize(ctx context.Context, f *framework.Framework, poolName string) i
return size
}

-func reserveMemory(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration, priorityClassName string) func() error {
+func reserveMemory(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration, priorityClassName, schedulerName string) func() error {
ginkgo.By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes))
request := int64(1024 * 1024 * megabytes / replicas)
config := &testutils.RCConfig{
@@ -1312,6 +1357,7 @@ func reserveMemory(ctx context.Context, f *framework.Framework, id string, repli
NodeSelector: selector,
Tolerations: tolerations,
PriorityClassName: priorityClassName,
SchedulerName: schedulerName,
}
for start := time.Now(); time.Since(start) < rcCreationRetryTimeout; time.Sleep(rcCreationRetryDelay) {
err := e2erc.RunRC(ctx, *config)
@@ -1333,19 +1379,25 @@ func reserveMemory(ctx context.Context, f *framework.Framework, id string, repli
// ReserveMemoryWithPriority creates a replication controller with pods with priority that, in summation,
// request the specified amount of memory.
func ReserveMemoryWithPriority(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, priorityClassName string) func() error {
-return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, priorityClassName)
+return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, priorityClassName, "")
}

// ReserveMemoryWithSelectorAndTolerations creates a replication controller with pods with node selector that, in summation,
// request the specified amount of memory.
func ReserveMemoryWithSelectorAndTolerations(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration) func() error {
-return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, selector, tolerations, "")
+return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, selector, tolerations, "", "")
}

// ReserveMemoryWithSchedulerName creates a replication controller with pods with scheduler name that, in summation,
// request the specified amount of memory.
func ReserveMemoryWithSchedulerName(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, schedulerName string) func() error {
return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, "", schedulerName)
}

// ReserveMemory creates a replication controller with pods that, in summation,
// request the specified amount of memory.
func ReserveMemory(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) func() error {
-return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, "")
+return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, "", "")
}

// WaitForClusterSizeFunc waits until the cluster size matches the given function.
1 change: 1 addition & 0 deletions test/e2e/feature/feature.go
@@ -132,6 +132,7 @@ var (
Windows = framework.WithFeature(framework.ValidFeatures.Add("Windows"))
WindowsHostProcessContainers = framework.WithFeature(framework.ValidFeatures.Add("WindowsHostProcessContainers"))
WindowsHyperVContainers = framework.WithFeature(framework.ValidFeatures.Add("WindowsHyperVContainers"))
ClusterScaleUpBypassScheduler = framework.WithFeature(framework.ValidFeatures.Add("ClusterScaleUpBypassScheduler"))

Review comment (Contributor): In the future please keep this in alphabetical order (#123260 will make that more obvious).

)

func init() {
4 changes: 3 additions & 1 deletion test/utils/runners.go
@@ -134,6 +134,7 @@ type RCConfig struct {
PriorityClassName string
TerminationGracePeriodSeconds *int64
Lifecycle *v1.Lifecycle
SchedulerName string

// Env vars, set the same for every pod.
Env map[string]string
@@ -615,7 +616,8 @@ func (config *RCConfig) create() error {
Annotations: config.Annotations,
},
Spec: v1.PodSpec{
-Affinity: config.Affinity,
+SchedulerName: config.SchedulerName,
+Affinity: config.Affinity,
Containers: []v1.Container{
{
Name: config.Name,
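
For context, a rough sketch (illustrative values, not code from this PR) of the relevant part of the pod template the updated RCConfig produces when SchedulerName is set. Because no scheduler with that name runs, the pods stay Pending unscheduled, which is the state the new autoscaling tests exercise:

spec := v1.PodSpec{
	// Copied from RCConfig.SchedulerName; "non-existing-bypassed-scheduler" is the
	// constant the tests use, so no running scheduler ever picks these pods up.
	SchedulerName: "non-existing-bypassed-scheduler",
	Containers: []v1.Container{{
		Name:  "memory-reservation",
		Image: "registry.k8s.io/pause:3.9", // illustrative; the real image comes from RCConfig.Image
		Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{
				// reserveMemory computes the per-replica request in bytes
				v1.ResourceMemory: *resource.NewQuantity(request, resource.BinarySI),
			},
		},
	}},
}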