Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Attempt to deflake networking tests in large clusters #98181

Merged
merged 1 commit into from Jan 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions test/e2e/framework/service/const.go
Expand Up @@ -51,10 +51,10 @@ const (
// LoadBalancerCreateTimeoutDefault is the default time to wait for a load balancer to be created/modified.
// TODO: once support ticket 21807001 is resolved, reduce this timeout back to something reasonable
// Hidden - use GetServiceLoadBalancerCreateTimeout function instead.
loadBalancerCreateTimeoutDefault = 20 * time.Minute
loadBalancerCreateTimeoutDefault = 10 * time.Minute
// LoadBalancerCreateTimeoutLarge is the maximum time to wait for a load balancer to be created/modified.
// Hidden - use GetServiceLoadBalancerCreateTimeout function instead.
loadBalancerCreateTimeoutLarge = 2 * time.Hour
loadBalancerCreateTimeoutLarge = 45 * time.Minute

// LoadBalancerPropagationTimeoutDefault is the default time to wait for pods to
// be targeted by load balancers.
Expand Down
38 changes: 21 additions & 17 deletions test/e2e/network/service.go
Expand Up @@ -2151,10 +2151,6 @@ var _ = SIGDescribe("Services", func() {
// this feature currently supported only on GCE/GKE/AWS
e2eskipper.SkipUnlessProviderIs("gce", "gke", "aws")

loadBalancerLagTimeout := e2eservice.LoadBalancerLagTimeoutDefault
if framework.ProviderIs("aws") {
loadBalancerLagTimeout = e2eservice.LoadBalancerLagTimeoutAWS
}
loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(cs)

namespace := f.Namespace.Name
Expand Down Expand Up @@ -2193,17 +2189,16 @@ var _ = SIGDescribe("Services", func() {
svc, err = jig.WaitForLoadBalancer(loadBalancerCreateTimeout)
framework.ExpectNoError(err)

// timeout when we haven't just created the load balancer
normalReachabilityTimeout := 2 * time.Minute

ginkgo.By("check reachability from different sources")
svcIP := e2eservice.GetIngressPoint(&svc.Status.LoadBalancer.Ingress[0])
// Wait longer as this is our first request after creation. We can't check using a separate method,
// because the LB should only be reachable from the "accept" pod
checkReachabilityFromPod(true, loadBalancerLagTimeout, namespace, acceptPod.Name, svcIP)
checkReachabilityFromPod(false, normalReachabilityTimeout, namespace, dropPod.Name, svcIP)

// Make sure dropPod is running. There are certain chances that the pod might be terminated due to unexpected reasons. dropPod, err = cs.CoreV1().Pods(namespace).Get(dropPod.Name, metav1.GetOptions{})
// We should wait until service changes are actually propagated in the cloud-provider,
// as this may take significant amount of time, especially in large clusters.
// However, the information whether it was already programmed isn't available.
// So we're resolving it by using loadBalancerCreateTimeout that takes cluster size into account.
checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
checkReachabilityFromPod(false, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)

// Make sure dropPod is running. There are certain chances that the pod might be terminated due to unexpected reasons.
dropPod, err = cs.CoreV1().Pods(namespace).Get(context.TODO(), dropPod.Name, metav1.GetOptions{})
framework.ExpectNoError(err, "Unable to get pod %s", dropPod.Name)
framework.ExpectEqual(acceptPod.Status.Phase, v1.PodRunning)
Expand All @@ -2215,16 +2210,25 @@ var _ = SIGDescribe("Services", func() {
svc.Spec.LoadBalancerSourceRanges = []string{dropPod.Status.PodIP + "/32"}
})
framework.ExpectNoError(err)
checkReachabilityFromPod(false, normalReachabilityTimeout, namespace, acceptPod.Name, svcIP)
checkReachabilityFromPod(true, normalReachabilityTimeout, namespace, dropPod.Name, svcIP)

// We should wait until service changes are actually propagated, as this may take
// significant amount of time, especially in large clusters.
// However, the information whether it was already programmed isn't available.
// So we're resolving it by using loadBalancerCreateTimeout that takes cluster size into account.
checkReachabilityFromPod(false, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)

ginkgo.By("Delete LoadBalancerSourceRange field and check reachability")
_, err = jig.UpdateService(func(svc *v1.Service) {
svc.Spec.LoadBalancerSourceRanges = nil
})
framework.ExpectNoError(err)
checkReachabilityFromPod(true, normalReachabilityTimeout, namespace, acceptPod.Name, svcIP)
checkReachabilityFromPod(true, normalReachabilityTimeout, namespace, dropPod.Name, svcIP)
// We should wait until service changes are actually propagated, as this may take
// significant amount of time, especially in large clusters.
// However, the information whether it was already programmed isn't available.
// So we're resolving it by using loadBalancerCreateTimeout that takes cluster size into account.
checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)
})

ginkgo.It("should be able to create an internal type load balancer [Slow]", func() {
Expand Down