Make the resize test wait until kube-system is healthy #17044

Merged
merged 1 commit on Nov 13, 2015
76 changes: 30 additions & 46 deletions test/e2e/resize_nodes.go
@@ -386,31 +386,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
}

var _ = Describe("Nodes", func() {
var c *client.Client
var ns string

BeforeEach(func() {
var err error
c, err = loadClient()
expectNoError(err)
testingNs, err := createTestingNS("resize-nodes", c)
ns = testingNs.Name
Expect(err).NotTo(HaveOccurred())
})

AfterEach(func() {
By("checking whether all nodes are healthy")
if err := allNodesReady(c, time.Minute); err != nil {
Failf("Not all nodes are ready: %v", err)
}
By(fmt.Sprintf("destroying namespace for this suite %s", ns))
if err := deleteNS(c, ns, 5*time.Minute /* namespace deletion timeout */); err != nil {
Failf("Couldn't delete namespace '%s', %v", ns, err)
}
if err := checkTestingNSDeletedExcept(c, ""); err != nil {
Failf("Couldn't delete testing namespaces '%s', %v", ns, err)
}
})
framework := NewFramework("resize-nodes")

Describe("Resize", func() {
var skipped bool
@@ -434,30 +410,38 @@ var _ = Describe("Nodes", func() {
if err := waitForGroupSize(testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original node instance group size: %v", err)
}
if err := waitForClusterSize(c, testContext.CloudConfig.NumNodes, 10*time.Minute); err != nil {
if err := waitForClusterSize(framework.Client, testContext.CloudConfig.NumNodes, 10*time.Minute); err != nil {
Failf("Couldn't restore the original cluster size: %v", err)
}
// Many e2e tests assume that the cluster is fully healthy before they start. Wait until
// the cluster is restored to health
By("waiting for system pods to successfully restart")
pods, err := framework.Client.Pods("kube-system").List(labels.Everything(), fields.Everything())
Member commented:

Wouldn't this lead to some races?
If the new node was just added, kube-system pods that are supposed to be running on that node may not have been created yet - or am I missing something?

Contributor Author replied:

The trouble is that when you kill a node and add a new one, things get rescheduled, so things like influx and kube-dns aren't running yet. Subsequent tests then fail because they expect those containers to already be up and running.

Expect(err).NotTo(HaveOccurred())

err = waitForPodsRunningReady("kube-system", len(pods.Items), podReadyBeforeTimeout)
Expect(err).NotTo(HaveOccurred())
})
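
The discussion above boils down to a race: the pod list is captured once, so kube-system pods that have not yet been (re)created on the replacement node are not counted. A minimal sketch of one way to narrow that window, not part of this PR, is to re-list the pods on every poll; it assumes the wait.Poll helper from pkg/util/wait and a podRunningReady readiness helper like the one in the e2e utilities:

// Sketch only: re-list kube-system pods on each poll so pods created after a
// node swap are still counted. Assumes pkg/util/wait is imported as "wait" and
// that a podRunningReady(*api.Pod) (bool, error) helper exists in test/e2e.
func waitForKubeSystemRunningReady(c *client.Client, timeout time.Duration) error {
	return wait.Poll(20*time.Second, timeout, func() (bool, error) {
		pods, err := c.Pods("kube-system").List(labels.Everything(), fields.Everything())
		if err != nil {
			// Treat transient list errors as "not ready yet" and retry.
			return false, nil
		}
		for i := range pods.Items {
			if ok, _ := podRunningReady(&pods.Items[i]); !ok {
				return false, nil
			}
		}
		return true, nil
	})
}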

It("should be able to delete nodes", func() {
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-delete-node"
replicas := testContext.CloudConfig.NumNodes
newRCByName(c, ns, name, replicas)
err := verifyPods(c, ns, name, true, replicas)
newRCByName(framework.Client, framework.Namespace.Name, name, replicas)
err := verifyPods(framework.Client, framework.Namespace.Name, name, true, replicas)
Expect(err).NotTo(HaveOccurred())

By(fmt.Sprintf("decreasing cluster size to %d", replicas-1))
err = resizeGroup(replicas - 1)
Expect(err).NotTo(HaveOccurred())
err = waitForGroupSize(replicas - 1)
Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas-1, 10*time.Minute)
err = waitForClusterSize(framework.Client, replicas-1, 10*time.Minute)
Expect(err).NotTo(HaveOccurred())

By("verifying whether the pods from the removed node are recreated")
err = verifyPods(c, ns, name, true, replicas)
err = verifyPods(framework.Client, framework.Namespace.Name, name, true, replicas)
Expect(err).NotTo(HaveOccurred())
})

@@ -466,24 +450,24 @@ var _ = Describe("Nodes", func() {
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-add-node"
newSVCByName(c, ns, name)
newSVCByName(framework.Client, framework.Namespace.Name, name)
replicas := testContext.CloudConfig.NumNodes
newRCByName(c, ns, name, replicas)
err := verifyPods(c, ns, name, true, replicas)
newRCByName(framework.Client, framework.Namespace.Name, name, replicas)
err := verifyPods(framework.Client, framework.Namespace.Name, name, true, replicas)
Expect(err).NotTo(HaveOccurred())

By(fmt.Sprintf("increasing cluster size to %d", replicas+1))
err = resizeGroup(replicas + 1)
Expect(err).NotTo(HaveOccurred())
err = waitForGroupSize(replicas + 1)
Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas+1, 10*time.Minute)
err = waitForClusterSize(framework.Client, replicas+1, 10*time.Minute)
Expect(err).NotTo(HaveOccurred())

By(fmt.Sprintf("increasing size of the replication controller to %d and verifying all pods are running", replicas+1))
err = resizeRC(c, ns, name, replicas+1)
err = resizeRC(framework.Client, framework.Namespace.Name, name, replicas+1)
Expect(err).NotTo(HaveOccurred())
err = verifyPods(c, ns, name, true, replicas+1)
err = verifyPods(framework.Client, framework.Namespace.Name, name, true, replicas+1)
Expect(err).NotTo(HaveOccurred())
})
})
@@ -507,25 +491,25 @@ var _ = Describe("Nodes", func() {
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-net"
newSVCByName(c, ns, name)
newSVCByName(framework.Client, framework.Namespace.Name, name)
replicas := testContext.CloudConfig.NumNodes
newRCByName(c, ns, name, replicas)
err := verifyPods(c, ns, name, true, replicas)
newRCByName(framework.Client, framework.Namespace.Name, name, replicas)
err := verifyPods(framework.Client, framework.Namespace.Name, name, true, replicas)
Expect(err).NotTo(HaveOccurred(), "Each pod should start running and responding")

By("choose a node with at least one pod - we will block some network traffic on this node")
label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
pods, err := c.Pods(ns).List(label, fields.Everything()) // list pods after all have been scheduled
pods, err := framework.Client.Pods(framework.Namespace.Name).List(label, fields.Everything()) // list pods after all have been scheduled
Expect(err).NotTo(HaveOccurred())
nodeName := pods.Items[0].Spec.NodeName

node, err := c.Nodes().Get(nodeName)
node, err := framework.Client.Nodes().Get(nodeName)
Expect(err).NotTo(HaveOccurred())

By(fmt.Sprintf("block network traffic from node %s", node.Name))
performTemporaryNetworkFailure(c, ns, name, replicas, pods.Items[0].Name, node)
performTemporaryNetworkFailure(framework.Client, framework.Namespace.Name, name, replicas, pods.Items[0].Name, node)
Logf("Waiting %v for node %s to be ready once temporary network failure ends", resizeNodeReadyTimeout, node.Name)
if !waitForNodeToBeReady(c, node.Name, resizeNodeReadyTimeout) {
if !waitForNodeToBeReady(framework.Client, node.Name, resizeNodeReadyTimeout) {
Failf("Node %s did not become ready within %v", node.Name, resizeNodeReadyTimeout)
}

@@ -536,14 +520,14 @@ var _ = Describe("Nodes", func() {
// increasing the RC size is not a valid way to test this
// since we have no guarantees the pod will be scheduled on our node.
additionalPod := "additionalpod"
err = newPodOnNode(c, ns, additionalPod, node.Name)
err = newPodOnNode(framework.Client, framework.Namespace.Name, additionalPod, node.Name)
Expect(err).NotTo(HaveOccurred())
err = verifyPods(c, ns, additionalPod, true, 1)
err = verifyPods(framework.Client, framework.Namespace.Name, additionalPod, true, 1)
Expect(err).NotTo(HaveOccurred())

// verify that it is really on the requested node
{
pod, err := c.Pods(ns).Get(additionalPod)
pod, err := framework.Client.Pods(framework.Namespace.Name).Get(additionalPod)
Expect(err).NotTo(HaveOccurred())
if pod.Spec.NodeName != node.Name {
Logf("Pod %s found on invalid node: %s instead of %s", pod.Name, pod.Spec.NodeName, node.Name)