intra-pod test improvements for bug-triage and node-level reporting #93837

Merged: 7 commits, Sep 22, 2020

48 changes: 42 additions & 6 deletions test/e2e/common/networking.go
@@ -18,6 +18,7 @@ package common

import (
"github.com/onsi/ginkgo"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/test/e2e/framework"
e2enetwork "k8s.io/kubernetes/test/e2e/framework/network"
@@ -28,6 +29,45 @@ var _ = ginkgo.Describe("[sig-network] Networking", func() {

ginkgo.Describe("Granular Checks: Pods", func() {

checkNodeConnectivity := func(config *e2enetwork.NetworkingTestConfig, protocol string, port int) {
// breadth first poll to quickly estimate failure.
failedPodsByHost := map[string][]*v1.Pod{}
// First time, we'll quickly try all pods, breadth first.
for _, endpointPod := range config.EndpointPods {
Member Author:

Note: this first 'shot' doesn't change the test semantics, but it lets us separate the test output between successful nodes and potentially problematic ones, so it's very natural to browse the diagnostic results.

framework.Logf("Breadth first check of %v on host %v...", endpointPod.Status.PodIP, endpointPod.Status.HostIP)
if err := config.DialFromTestContainer(protocol, endpointPod.Status.PodIP, port, 1, 0, sets.NewString(endpointPod.Name)); err != nil {
if _, ok := failedPodsByHost[endpointPod.Status.HostIP]; !ok {
failedPodsByHost[endpointPod.Status.HostIP] = []*v1.Pod{}
}
failedPodsByHost[endpointPod.Status.HostIP] = append(failedPodsByHost[endpointPod.Status.HostIP], endpointPod)
framework.Logf("...failed...will try again in next pass")
}
}
errors := []error{}
Member Author:

This slice is aggregated later so that, at the end of the test, all errors are printed out together and the ratio of failed polls is available; that way you can clearly tell whether the cluster is in a catastrophic state or whether there are just a few problematic nodes.

// Second time, we pass through pods more carefully...
framework.Logf("Going to retry %v out of %v pods....", len(failedPodsByHost), len(config.EndpointPods))
for host, failedPods := range failedPodsByHost {
Member Author:

If one node is down, all the pods from that node will be shown together.

Member Author:

(IMO) this isn't fundamentally changing the tests, because there is no ordering guarantee in the conformance definition. That said, if we want it randomly ordered by pod, I can still do that explicitly.

Note that the default retry budget works out to roughly 66 polls per pod for a 6-node cluster (nodes^2 + 30), so this test is highly stable in terms of retries and the ordering isn't a sensitive thing at all.

framework.Logf("Doublechecking %v pods in host %v which werent seen the first time.", len(failedPods), host)
for _, endpointPod := range failedPods {
framework.Logf("Now attempting to probe pod [[[ %v ]]]", endpointPod.Status.PodIP)
if err := config.DialFromTestContainer(protocol, endpointPod.Status.PodIP, port, config.MaxTries, 0, sets.NewString(endpointPod.Name)); err != nil {
errors = append(errors, err)
} else {
framework.Logf("Was able to reach %v on %v ", endpointPod.Status.PodIP, endpointPod.Status.HostIP)
}
framework.Logf("... Done probing pod [[[ %v ]]]", endpointPod.Status.PodIP)
}
framework.Logf("succeeded at polling %v out of %v connections", len(config.EndpointPods)-len(errors), len(config.EndpointPods))
}
if len(errors) > 0 {
framework.Logf("pod polling failure summary:")
for _, e := range errors {
framework.Logf("Collected error: %v", e)
Member:

This log message seems to duplicate line 55, which already explains what error happened on each DialFromTestContainer() call.

Contributor:

👍

}
framework.Failf("failed, %v out of %v connections failed", len(errors), len(config.EndpointPods))
}
}

// Try to hit all endpoints through a test container, retry 5 times,
// expect exactly one unique hostname. Each of these endpoints reports
// its own hostname.
@@ -39,9 +79,7 @@ var _ = ginkgo.Describe("[sig-network] Networking", func() {
*/
framework.ConformanceIt("should function for intra-pod communication: http [NodeConformance]", func() {
config := e2enetwork.NewCoreNetworkingTestConfig(f, false)
for _, endpointPod := range config.EndpointPods {
config.DialFromTestContainer("http", endpointPod.Status.PodIP, e2enetwork.EndpointHTTPPort, config.MaxTries, 0, sets.NewString(endpointPod.Name))
}
checkNodeConnectivity(config, "http", e2enetwork.EndpointHTTPPort)
})

/*
@@ -52,9 +90,7 @@ var _ = ginkgo.Describe("[sig-network] Networking", func() {
*/
framework.ConformanceIt("should function for intra-pod communication: udp [NodeConformance]", func() {
config := e2enetwork.NewCoreNetworkingTestConfig(f, false)
for _, endpointPod := range config.EndpointPods {
config.DialFromTestContainer("udp", endpointPod.Status.PodIP, e2enetwork.EndpointUDPPort, config.MaxTries, 0, sets.NewString(endpointPod.Name))
}
checkNodeConnectivity(config, "udp", e2enetwork.EndpointUDPPort)
})

/*
25 changes: 15 additions & 10 deletions test/e2e/framework/network/utils.go
@@ -167,17 +167,17 @@ type NetexecDialResponse struct {
}

// DialFromEndpointContainer executes a curl via kubectl exec in an endpoint container.
func (config *NetworkingTestConfig) DialFromEndpointContainer(protocol, targetIP string, targetPort, maxTries, minTries int, expectedEps sets.String) {
config.DialFromContainer(protocol, echoHostname, config.EndpointPods[0].Status.PodIP, targetIP, EndpointHTTPPort, targetPort, maxTries, minTries, expectedEps)
func (config *NetworkingTestConfig) DialFromEndpointContainer(protocol, targetIP string, targetPort, maxTries, minTries int, expectedEps sets.String) error {
Member:

I'm afraid we have to revert this commit: this method is used by other tests, and now, instead of failing, it returns an error. Those other tests are not asserting on the error, so a failure will go unnoticed.

ginkgo.It("should function for endpoint-Service: http", func() {
config := e2enetwork.NewNetworkingTestConfig(f, false, false)
ginkgo.By(fmt.Sprintf("dialing(http) %v (endpoint) --> %v:%v (config.clusterIP)", config.EndpointPods[0].Name, config.ClusterIP, e2enetwork.ClusterHTTPPort))
config.DialFromEndpointContainer("http", config.ClusterIP, e2enetwork.ClusterHTTPPort, config.MaxTries, 0, config.EndpointHostnames())
ginkgo.By(fmt.Sprintf("dialing(http) %v (endpoint) --> %v:%v (nodeIP)", config.EndpointPods[0].Name, config.NodeIP, config.NodeHTTPPort))
config.DialFromEndpointContainer("http", config.NodeIP, config.NodeHTTPPort, config.MaxTries, 0, config.EndpointHostnames())
})
ginkgo.It("should function for endpoint-Service: udp", func() {
config := e2enetwork.NewNetworkingTestConfig(f, false, false)
ginkgo.By(fmt.Sprintf("dialing(udp) %v (endpoint) --> %v:%v (config.clusterIP)", config.EndpointPods[0].Name, config.ClusterIP, e2enetwork.ClusterUDPPort))
config.DialFromEndpointContainer("udp", config.ClusterIP, e2enetwork.ClusterUDPPort, config.MaxTries, 0, config.EndpointHostnames())
ginkgo.By(fmt.Sprintf("dialing(udp) %v (endpoint) --> %v:%v (nodeIP)", config.EndpointPods[0].Name, config.NodeIP, config.NodeUDPPort))
config.DialFromEndpointContainer("udp", config.NodeIP, config.NodeUDPPort, config.MaxTries, 0, config.EndpointHostnames())
})
// This test ensures that in a situation where multiple services exist with the same selector,
// deleting one of the services does not affect the connectivity of the remaining service
ginkgo.It("should function for multiple endpoint-Services with same selector", func() {
config := e2enetwork.NewNetworkingTestConfig(f, false, false)
ginkgo.By("creating a second service with same selector")
svc2, httpPort := createSecondNodePortService(f, config)
// original service should work
ginkgo.By(fmt.Sprintf("dialing(http) %v (endpoint) --> %v:%v (config.clusterIP)", config.EndpointPods[0].Name, config.ClusterIP, e2enetwork.ClusterHTTPPort))
config.DialFromEndpointContainer("http", config.ClusterIP, e2enetwork.ClusterHTTPPort, config.MaxTries, 0, config.EndpointHostnames())
ginkgo.By(fmt.Sprintf("dialing(http) %v (endpoint) --> %v:%v (nodeIP)", config.EndpointPods[0].Name, config.NodeIP, config.NodeHTTPPort))
config.DialFromEndpointContainer("http", config.NodeIP, config.NodeHTTPPort, config.MaxTries, 0, config.EndpointHostnames())
// Dial second service
ginkgo.By(fmt.Sprintf("dialing(http) %v (endpoint) --> %v:%v (svc2.clusterIP)", config.EndpointPods[0].Name, svc2.Spec.ClusterIP, e2enetwork.ClusterHTTPPort))
config.DialFromEndpointContainer("http", svc2.Spec.ClusterIP, e2enetwork.ClusterHTTPPort, config.MaxTries, 0, config.EndpointHostnames())
ginkgo.By(fmt.Sprintf("dialing(http) %v (endpoint) --> %v:%v (nodeIP)", config.EndpointPods[0].Name, config.NodeIP, httpPort))
config.DialFromEndpointContainer("http", config.NodeIP, httpPort, config.MaxTries, 0, config.EndpointHostnames())
ginkgo.By("deleting the original node port service")
config.DeleteNodePortService()
// Second service should continue to function unaffected
ginkgo.By(fmt.Sprintf("dialing(http) %v (endpoint) --> %v:%v (svc2.clusterIP)", config.EndpointPods[0].Name, svc2.Spec.ClusterIP, e2enetwork.ClusterHTTPPort))
config.DialFromEndpointContainer("http", svc2.Spec.ClusterIP, e2enetwork.ClusterHTTPPort, config.MaxTries, 0, config.EndpointHostnames())
ginkgo.By(fmt.Sprintf("dialing(http) %v (endpoint) --> %v:%v (nodeIP)", config.EndpointPods[0].Name, config.NodeIP, httpPort))
config.DialFromEndpointContainer("http", config.NodeIP, httpPort, config.MaxTries, 0, config.EndpointHostnames())

return config.DialFromContainer(protocol, echoHostname, config.EndpointPods[0].Status.PodIP, targetIP, EndpointHTTPPort, targetPort, maxTries, minTries, expectedEps)
}

// DialFromTestContainer executes a curl via kubectl exec in a test container.
func (config *NetworkingTestConfig) DialFromTestContainer(protocol, targetIP string, targetPort, maxTries, minTries int, expectedEps sets.String) {
config.DialFromContainer(protocol, echoHostname, config.TestContainerPod.Status.PodIP, targetIP, testContainerHTTPPort, targetPort, maxTries, minTries, expectedEps)
func (config *NetworkingTestConfig) DialFromTestContainer(protocol, targetIP string, targetPort, maxTries, minTries int, expectedEps sets.String) error {
return config.DialFromContainer(protocol, echoHostname, config.TestContainerPod.Status.PodIP, targetIP, testContainerHTTPPort, targetPort, maxTries, minTries, expectedEps)
}

// DialEchoFromTestContainer executes a curl via kubectl exec in a test container. The response is expected to match the echoMessage.
func (config *NetworkingTestConfig) DialEchoFromTestContainer(protocol, targetIP string, targetPort, maxTries, minTries int, echoMessage string) {
func (config *NetworkingTestConfig) DialEchoFromTestContainer(protocol, targetIP string, targetPort, maxTries, minTries int, echoMessage string) error {
expectedResponse := sets.NewString()
expectedResponse.Insert(echoMessage)
var dialCommand string
@@ -191,7 +191,7 @@ func (config *NetworkingTestConfig) DialEchoFromTestContainer(protocol, targetIP
} else {
dialCommand = fmt.Sprintf("echo%%20%s", echoMessage)
}
config.DialFromContainer(protocol, dialCommand, config.TestContainerPod.Status.PodIP, targetIP, testContainerHTTPPort, targetPort, maxTries, minTries, expectedResponse)
return config.DialFromContainer(protocol, dialCommand, config.TestContainerPod.Status.PodIP, targetIP, testContainerHTTPPort, targetPort, maxTries, minTries, expectedResponse)
}

// diagnoseMissingEndpoints prints debug information about the endpoints that
@@ -248,7 +248,8 @@ func makeCURLDialCommand(ipPort, dialCmd, protocol, targetIP string, targetPort
// maxTries == minTries will confirm that we see the expected endpoints and no
// more for maxTries. Use this if you want to eg: fail a readiness check on a
// pod and confirm it doesn't show up as an endpoint.
func (config *NetworkingTestConfig) DialFromContainer(protocol, dialCommand, containerIP, targetIP string, containerHTTPPort, targetPort, maxTries, minTries int, expectedResponses sets.String) {
// Returns nil if no error, or error message if failed after trying maxTries.
func (config *NetworkingTestConfig) DialFromContainer(protocol, dialCommand, containerIP, targetIP string, containerHTTPPort, targetPort, maxTries, minTries int, expectedResponses sets.String) error {
ipPort := net.JoinHostPort(containerIP, strconv.Itoa(containerHTTPPort))
cmd := makeCURLDialCommand(ipPort, dialCommand, protocol, targetIP, targetPort)

@@ -273,16 +274,19 @@ func (config *NetworkingTestConfig) DialFromContainer(protocol, dialCommand, con

// Check against i+1 so we exit if minTries == maxTries.
if (responses.Equal(expectedResponses) || responses.Len() == 0 && expectedResponses.Len() == 0) && i+1 >= minTries {
return
framework.Logf("reached %v after %v/%v tries", targetIP, i, maxTries)
return nil
}
// TODO: get rid of this delay #36281
time.Sleep(hitEndpointRetryDelay)
}

if dialCommand == echoHostname {
config.diagnoseMissingEndpoints(responses)
}
framework.Failf("Failed to find expected responses:\nTries %d\nCommand %v\nretrieved %v\nexpected %v\n", maxTries, cmd, responses, expectedResponses)
returnMsg := fmt.Errorf("did not find expected responses... \nTries %d\nCommand %v\nretrieved %v\nexpected %v", maxTries, cmd, responses, expectedResponses)
framework.Logf("encountered error during dial (%v)", returnMsg)
return returnMsg

}
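
As a usage note for the doc comment above: with the error now surfaced to callers, the maxTries == minTries mode can be asserted directly. A minimal sketch of a hypothetical caller (expectEndpointNeverSeen and notReadyPodName are illustrative names, not part of this diff; assumes the usual e2enetwork, framework, and sets imports):

// expectEndpointNeverSeen dials for the full retry budget and fails the test if the
// not-ready pod's hostname ever shows up among the accumulated responses.
func expectEndpointNeverSeen(config *e2enetwork.NetworkingTestConfig, notReadyPodName string) {
	expected := config.EndpointHostnames().Difference(sets.NewString(notReadyPodName))
	err := config.DialFromTestContainer("http", config.ClusterIP, e2enetwork.ClusterHTTPPort,
		config.MaxTries /* maxTries */, config.MaxTries /* minTries */, expected)
	framework.ExpectNoError(err, "unexpected endpoint appeared while pod was not ready")
}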

// GetEndpointsFromTestContainer executes a curl via kubectl exec in a test container.
@@ -676,6 +680,7 @@ func (config *NetworkingTestConfig) setupCore(selector map[string]string) {

epCount := len(config.EndpointPods)
config.MaxTries = epCount*epCount + testTries
framework.Logf("Setting MaxTries for pod polling to %v for networking test based on endpoint count %v", config.MaxTries, epCount)
}
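
For a rough sense of scale of that retry budget, a quick illustration (assumes testTries is 30, matching the "(nodes^2 + 30)" remark in the review discussion above; the constant's actual value is not shown in this hunk):

package main

import "fmt"

func main() {
	testTries := 30 // assumed value for illustration; not part of this diff
	for _, epCount := range []int{3, 6, 10} {
		// MaxTries = epCount*epCount + testTries, as set in setupCore above.
		fmt.Printf("%d endpoint pods -> MaxTries = %d per dial\n", epCount, epCount*epCount+testTries)
	}
	// Prints 39, 66, and 130 respectively.
}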

// setup includes setupCore and also sets up services