kubeadm: add an upgrade health check that deploys a Job #81319

Merged
2 changes: 1 addition & 1 deletion cmd/kubeadm/app/cmd/upgrade/common.go
@@ -126,7 +126,7 @@ func enforceRequirements(flags *applyPlanFlags, dryRun bool, newK8sVersion string
}

// Run healthchecks against the cluster
if err := upgrade.CheckClusterHealth(client, ignorePreflightErrorsSet); err != nil {
if err := upgrade.CheckClusterHealth(client, &cfg.ClusterConfiguration, ignorePreflightErrorsSet); err != nil {
return nil, nil, nil, errors.Wrap(err, "[upgrade/health] FATAL")
}

2 changes: 2 additions & 0 deletions cmd/kubeadm/app/phases/upgrade/BUILD
@@ -36,13 +36,15 @@ go_library(
"//cmd/kubeadm/app/util/etcd:go_default_library",
"//cmd/kubeadm/app/util/staticpod:go_default_library",
"//staging/src/k8s.io/api/apps/v1:go_default_library",
"//staging/src/k8s.io/api/batch/v1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/version:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/component-base/version:go_default_library",
"//vendor/github.com/coredns/corefile-migration/migration:go_default_library",
158 changes: 133 additions & 25 deletions cmd/kubeadm/app/phases/upgrade/health.go
@@ -18,32 +18,39 @@ package upgrade

import (
"fmt"
"net/http"
"os"
"time"

"github.com/pkg/errors"

apps "k8s.io/api/apps/v1"
"k8s.io/api/core/v1"
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog"
kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
"k8s.io/kubernetes/cmd/kubeadm/app/constants"
"k8s.io/kubernetes/cmd/kubeadm/app/images"
"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
utilpointer "k8s.io/utils/pointer"
)

// healthCheck is a helper struct for easily performing healthchecks against the cluster and printing the output
type healthCheck struct {
name string
client clientset.Interface
// f is invoked with a k8s client passed to it. Should return an optional error
f func(clientset.Interface) error
cfg *kubeadmapi.ClusterConfiguration
// f is invoked with a k8s client and a kubeadm ClusterConfiguration passed to it. Should return an optional error
f func(clientset.Interface, *kubeadmapi.ClusterConfiguration) error
}

// Check is part of the preflight.Checker interface
func (c *healthCheck) Check() (warnings, errors []error) {
if err := c.f(c.client); err != nil {
if err := c.f(c.client, c.cfg); err != nil {
return nil, []error{err}
}
return nil, nil
@@ -59,49 +66,150 @@ func (c *healthCheck) Name() string {
// - all control-plane Nodes are Ready
// - (if self-hosted) that there are DaemonSets with at least one Pod for all control plane components
// - (if static pod-hosted) that all required Static Pod manifests exist on disk
func CheckClusterHealth(client clientset.Interface, ignoreChecksErrors sets.String) error {
fmt.Println("[upgrade] Making sure the cluster is healthy:")
func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.String) error {
fmt.Println("[upgrade] Running cluster health checks")

healthChecks := []preflight.Checker{
&healthCheck{
name: "APIServerHealth",
name: "CreateJob",
client: client,
f: apiServerHealthy,
cfg: cfg,
f: createJob,
},
&healthCheck{
name: "ControlPlaneNodesReady",
client: client,
f: controlPlaneNodesReady,
},
// TODO: Add a check for ComponentStatuses here?
&healthCheck{
name: "StaticPodManifest",
client: client,
cfg: cfg,
f: staticPodManifestHealth,
},
}

healthChecks = append(healthChecks, &healthCheck{
name: "StaticPodManifest",
client: client,
f: staticPodManifestHealth,
})

return preflight.RunChecks(healthChecks, os.Stderr, ignoreChecksErrors)
}
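
For reference, the healthCheck struct above satisfies kubeadm's preflight.Checker interface, and RunChecks drives the slice built here. A minimal sketch of how a checker plugs into that contract (noopCheck is hypothetical; the interface and RunChecks signature are taken from cmd/kubeadm/app/preflight as of this release):

package main

import (
	"os"

	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
)

// noopCheck is a hypothetical Checker used only for illustration.
type noopCheck struct{}

func (noopCheck) Name() string { return "Noop" }

func (noopCheck) Check() (warnings, errs []error) { return nil, nil }

func main() {
	// Errors from checks whose Name() appears in the ignore set are
	// reported as warnings instead of failing the whole run.
	_ = preflight.RunChecks([]preflight.Checker{noopCheck{}}, os.Stderr, sets.NewString())
}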

// apiServerHealthy checks whether the API server's /healthz endpoint is healthy
func apiServerHealthy(client clientset.Interface) error {
healthStatus := 0
// createJob is a check that verifies that a Job can be created in the cluster
func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
const (
jobName = "upgrade-health-check"
ns = metav1.NamespaceSystem
timeout = 15 * time.Second
)

// If client.Discovery().RESTClient() is nil, the fake client is used, and that means we are dry-running. Just proceed
// If client.Discovery().RESTClient() is nil, the fake client is used.
// Return early because the kubeadm dryrun dynamic client only handles the core/v1 GroupVersion.
Member Author (neolit123) commented:

Had to add this.

The dynamic client fails when trying to get a batch/v1 Job here:

unstructuredObj, err := clg.dynamicClient.Resource(action.GetResource()).Namespace(action.GetNamespace()).Get(action.GetName(), metav1.GetOptions{})

I think we should stop using the dynamic client for dryrun, but this is a much larger refactor.
Until then we have to skip the logic in this preflight check on dryrun.

Contributor replied:

+1 we can have a backlog issue for this so an eager contributor can try it.
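
The dry-run guard just below keys on a property of the client-go fake clientset: its fake discovery implementation returns a nil RESTClient. A minimal sketch demonstrating that behavior (an observation about client-go's fake package, not an API guarantee):

package main

import (
	"fmt"

	fakeclientset "k8s.io/client-go/kubernetes/fake"
)

func main() {
	client := fakeclientset.NewSimpleClientset()
	// The fake discovery client has no REST client behind it, which is
	// what kubeadm uses to detect a dry run and skip the Job-based check.
	fmt.Println(client.Discovery().RESTClient() == nil) // prints: true
}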

if client.Discovery().RESTClient() == nil {
fmt.Printf("[dryrun] Would create the Job %q in namespace %q and wait until it completes\n", jobName, ns)
return nil
}
client.Discovery().RESTClient().Get().AbsPath("/healthz").Do().StatusCode(&healthStatus)
if healthStatus != http.StatusOK {
return errors.Errorf("the API Server is unhealthy; /healthz didn't return %q", "ok")

// Prepare Job
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: jobName,
Namespace: ns,
},
Spec: batchv1.JobSpec{
BackoffLimit: utilpointer.Int32Ptr(0),
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
SecurityContext: &v1.PodSecurityContext{
RunAsUser: utilpointer.Int64Ptr(999),
RunAsGroup: utilpointer.Int64Ptr(999),
RunAsNonRoot: utilpointer.BoolPtr(true),
},
Tolerations: []v1.Toleration{
{
Key: "node-role.kubernetes.io/master",
Effect: v1.TaintEffectNoSchedule,
},
},
Containers: []v1.Container{
{
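// The pause binary prints its version and exits immediately when
// passed -v, so this Pod completes almost as soon as it is scheduled
// (an observation about the pause image, not stated in the PR).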
Name: jobName,
Image: images.GetPauseImage(cfg),
Args: []string{"-v"},
},
},
},
},
},
}

// Check if the Job already exists and delete it
if _, err := client.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{}); err == nil {
if err = deleteHealthCheckJob(client, ns, jobName); err != nil {
return err
}
}

// Cleanup the Job on exit
defer func() {
lastError = deleteHealthCheckJob(client, ns, jobName)
}()

// Create the Job, but retry in case it is being currently deleted
klog.V(2).Infof("Creating Job %q in the namespace %q", jobName, ns)
err := wait.PollImmediate(time.Second*1, timeout, func() (bool, error) {
if _, err := client.BatchV1().Jobs(ns).Create(job); err != nil {
klog.V(2).Infof("Could not create Job %q in the namespace %q, retrying: %v", jobName, ns, err)
lastError = err
return false, nil
}
return true, nil
})
if err != nil {
return errors.Wrapf(lastError, "could not create Job %q in the namespace %q", jobName, ns)
}
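
Both retry loops in this function use wait.PollImmediate, which invokes the condition function once right away and then once per interval, stopping when it returns true, when it returns a non-nil error, or when the timeout elapses. A minimal sketch of the calling convention (ready() is a hypothetical stand-in for a real probe):

package main

import (
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func ready() bool { return true } // hypothetical stand-in

func main() {
	// The condition returns (done, err): true stops polling successfully,
	// a non-nil error aborts immediately, and (false, nil) retries.
	_ = wait.PollImmediate(time.Second, 15*time.Second, func() (bool, error) {
		return ready(), nil
	})
}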

// Waiting and manually deleting the Job is a workaround for not enabling the TTL controller.
// TODO: refactor this if the TTL controller is enabled in kubeadm once it goes Beta.

// Wait for the Job to complete
err = wait.PollImmediate(time.Second*1, timeout, func() (bool, error) {
job, err := client.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{})
if err != nil {
lastError = err
klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
return false, nil
}
for _, cond := range job.Status.Conditions {
if cond.Type == batchv1.JobComplete {
return true, nil
}
}
lastError = errors.Errorf("no condition of type %v", batchv1.JobComplete)
klog.V(2).Infof("Job %q in the namespace %q is not yet complete, retrying", jobName, ns)
return false, nil
})
if err != nil {
return errors.Wrapf(lastError, "Job %q in the namespace %q did not complete in %v", jobName, ns, timeout)
}

klog.V(2).Infof("Job %q in the namespace %q completed", jobName, ns)

return nil
}

func deleteHealthCheckJob(client clientset.Interface, ns, jobName string) error {
klog.V(2).Infof("Deleting Job %q in the namespace %q", jobName, ns)
propagation := metav1.DeletePropagationForeground
deleteOptions := &metav1.DeleteOptions{
PropagationPolicy: &propagation,
}
if err := client.BatchV1().Jobs(ns).Delete(jobName, deleteOptions); err != nil {
return errors.Wrapf(err, "could not delete Job %q in the namespace %q", jobName, ns)
}
return nil
}
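
With foreground propagation, the Delete call returns as soon as deletion is initiated; the Job object itself lingers until its Pods are removed, which is why createJob above retries Create against a Job that is still being deleted. A caller that needed to block until the Job has fully disappeared could poll for NotFound, roughly like this (a sketch, not part of this PR; waitForJobGone is a hypothetical helper):

import (
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
)

// waitForJobGone blocks until the Job no longer exists or the timeout elapses.
func waitForJobGone(client clientset.Interface, ns, jobName string) error {
	return wait.PollImmediate(time.Second, 15*time.Second, func() (bool, error) {
		_, err := client.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{})
		if apierrors.IsNotFound(err) {
			return true, nil // the Job and its Pods are fully deleted
		}
		// Still present (or a transient error): keep polling.
		return false, nil
	})
}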

// controlPlaneNodesReady checks whether all control-plane Nodes in the cluster are in the Ready state
func controlPlaneNodesReady(client clientset.Interface) error {
func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
selector := labels.SelectorFromSet(labels.Set(map[string]string{
constants.LabelNodeRoleMaster: "",
}))
@@ -124,7 +232,7 @@ func controlPlaneNodesReady(client clientset.Interface) error {
}

// staticPodManifestHealth makes sure the required static pods are present
func staticPodManifestHealth(_ clientset.Interface) error {
func staticPodManifestHealth(_ clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
nonExistentManifests := []string{}
for _, component := range constants.ControlPlaneComponents {
manifestFile := constants.GetStaticPodFilepath(component, constants.GetStaticPodDirectory())