Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tests for ignoring scheduler processing #121783

Merged
merged 4 commits into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
58 changes: 54 additions & 4 deletions test/e2e/autoscaling/cluster_size_autoscaling.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ const (
highPriorityClassName = "high-priority"

gpuLabel = "cloud.google.com/gke-accelerator"

nonExistingBypassedSchedulerName = "non-existing-bypassed-scheduler"
)

var _ = SIGDescribe("Cluster size autoscaling", framework.WithSlow(), func() {
Expand Down Expand Up @@ -1000,6 +1002,47 @@ var _ = SIGDescribe("Cluster size autoscaling", framework.WithSlow(), func() {
framework.ExpectNoError(WaitForClusterSizeFunc(ctx, f.ClientSet,
func(size int) bool { return size == increasedSize }, time.Second))
})

// Pods that bypass scheduler processing (assigned to a scheduler name the
// autoscaler is configured to ignore) must still trigger a scale-up when
// they would otherwise be unschedulable.
f.It("should scale up when unprocessed pod is created and is going to be unschedulable", feature.ClusterScaleUpBypassScheduler, func(ctx context.Context) {
	// Reserve 70% of a single node's allocatable memory per replica with
	// twice as many replicas as nodes — enough pressure to force a
	// scale-up for ordinary pods.
	replicas := 2 * nodeCount
	memToReserve := int(float64(replicas) * float64(0.7) * float64(memAllocatableMb))
	ginkgo.DeferCleanup(ReserveMemoryWithSchedulerName(ctx, f, "memory-reservation", replicas, memToReserve, false, 1, nonExistingBypassedSchedulerName))
	ginkgo.By("Waiting for cluster scale-up")
	// Soft check: machines of other types can be added as well, so only
	// require that the node count grew past the initial size.
	grewPastInitial := func(size int) bool {
		return size > nodeCount
	}
	framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(ctx, f.ClientSet, grewPastInitial, scaleUpTimeout, 0))
})
f.It("shouldn't scale up when unprocessed pod is created and is going to be schedulable", feature.ClusterScaleUpBypassScheduler, func(ctx context.Context) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test should have been marked as "slow" because it blocks for 5 minutes (= scaleUpTimeout).

// 50% of allocatable memory of a single node, so that no scale up would trigger in normal cases
replicaCount := 1
reservedMemory := int(float64(0.5) * float64(memAllocatableMb))
ginkgo.DeferCleanup(ReserveMemoryWithSchedulerName(ctx, f, "memory-reservation", replicaCount, reservedMemory, false, 1, nonExistingBypassedSchedulerName))
// Verify that cluster size is the same
ginkgo.By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String()))
time.Sleep(scaleUpTimeout)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is there a sleep here?

can't we do an active loop with a wait.Poll per example?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mainly to be consistent with other tests that don't expect a scale up/down; in this test (and the next one) we're expecting no scale-up/down.
for example this test https://github.com/kubernetes/kubernetes/pull/121783/files/6073d1cd3d58384d12a750bda749ff1922812be3#diff-4f7cc8ec3b56aa879a019633705140fea00aafda1495d321704f92bd31ec6468R1001 doesn't expect a scale down so it sleeps for a while and then checks for the size

We can update it however to use a Poll up to a timeout and update other tests that sleep as well

Updated to use gomega.Consistently instead of sleeping

sizeFunc := func(size int) bool {
return size == nodeCount
}
framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(ctx, f.ClientSet, sizeFunc, time.Second, 0))
})
f.It("shouldn't scale up when unprocessed pod is created and scheduler is not specified to be bypassed", feature.ClusterScaleUpBypassScheduler, func(ctx context.Context) {
// 70% of allocatable memory of a single node * replica count, forcing a scale up in case of normal pods
replicaCount := 2 * nodeCount
reservedMemory := int(float64(replicaCount) * float64(0.7) * float64(memAllocatableMb))
schedulerName := "non-existent-scheduler-" + f.UniqueName
ginkgo.DeferCleanup(ReserveMemoryWithSchedulerName(ctx, f, "memory-reservation", replicaCount, reservedMemory, false, 1, schedulerName))
// Verify that cluster size is the same
ginkgo.By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC we are not actually sleeping here, although we should be.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, my bad — I forgot to keep it consistent with the previous test. It should be all right now.

time.Sleep(scaleUpTimeout)
sizeFunc := func(size int) bool {
return size == nodeCount
}
framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(ctx, f.ClientSet, sizeFunc, time.Second, 0))
})
})

func installNvidiaDriversDaemonSet(ctx context.Context, f *framework.Framework) {
Expand Down Expand Up @@ -1298,7 +1341,7 @@ func getPoolSize(ctx context.Context, f *framework.Framework, poolName string) i
return size
}

func reserveMemory(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration, priorityClassName string) func() error {
func reserveMemory(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration, priorityClassName, schedulerName string) func() error {
ginkgo.By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes))
request := int64(1024 * 1024 * megabytes / replicas)
config := &testutils.RCConfig{
Expand All @@ -1312,6 +1355,7 @@ func reserveMemory(ctx context.Context, f *framework.Framework, id string, repli
NodeSelector: selector,
Tolerations: tolerations,
PriorityClassName: priorityClassName,
SchedulerName: schedulerName,
}
for start := time.Now(); time.Since(start) < rcCreationRetryTimeout; time.Sleep(rcCreationRetryDelay) {
err := e2erc.RunRC(ctx, *config)
Expand All @@ -1333,19 +1377,25 @@ func reserveMemory(ctx context.Context, f *framework.Framework, id string, repli
// ReserveMemoryWithPriority creates a replication controller with pods with priority that, in summation,
// request the specified amount of memory.
func ReserveMemoryWithPriority(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, priorityClassName string) func() error {
	// Empty scheduler name: the pods are handled by the default scheduler.
	return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, priorityClassName, "")
}

// ReserveMemoryWithSelectorAndTolerations creates a replication controller with pods with node selector that, in summation,
// request the specified amount of memory.
func ReserveMemoryWithSelectorAndTolerations(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string, tolerations []v1.Toleration) func() error {
	// No priority class and an empty scheduler name: only the selector and
	// tolerations distinguish these pods.
	return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, selector, tolerations, "", "")
}

// ReserveMemoryWithSchedulerName creates a replication controller whose pods
// are assigned the given scheduler name and, in summation, request the
// specified amount of memory.
func ReserveMemoryWithSchedulerName(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, schedulerName string) func() error {
	// No node selector, tolerations, or priority class — only the scheduler
	// name distinguishes these pods.
	cleanup := reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, "", schedulerName)
	return cleanup
}

// ReserveMemory creates a replication controller with pods that, in summation,
// request the specified amount of memory.
func ReserveMemory(ctx context.Context, f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) func() error {
	// No selector, tolerations, priority class, or scheduler name — plain
	// default-scheduled pods.
	return reserveMemory(ctx, f, id, replicas, megabytes, expectRunning, timeout, nil, nil, "", "")
}

// WaitForClusterSizeFunc waits until the cluster size matches the given function.
Expand Down
1 change: 1 addition & 0 deletions test/e2e/feature/feature.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ var (
Windows = framework.WithFeature(framework.ValidFeatures.Add("Windows"))
WindowsHostProcessContainers = framework.WithFeature(framework.ValidFeatures.Add("WindowsHostProcessContainers"))
WindowsHyperVContainers = framework.WithFeature(framework.ValidFeatures.Add("WindowsHyperVContainers"))
ClusterScaleUpBypassScheduler = framework.WithFeature(framework.ValidFeatures.Add("ClusterScaleUpBypassScheduler"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the future please keep this in alphabetical order (#123260 will make that more obvious).

)

func init() {
Expand Down
4 changes: 3 additions & 1 deletion test/utils/runners.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ type RCConfig struct {
PriorityClassName string
TerminationGracePeriodSeconds *int64
Lifecycle *v1.Lifecycle
SchedulerName string

// Env vars, set the same for every pod.
Env map[string]string
Expand Down Expand Up @@ -615,7 +616,8 @@ func (config *RCConfig) create() error {
Annotations: config.Annotations,
},
Spec: v1.PodSpec{
Affinity: config.Affinity,
SchedulerName: config.SchedulerName,
Affinity: config.Affinity,
Containers: []v1.Container{
{
Name: config.Name,
Expand Down