From f894990c0dc0cd8c7b6a2a6431e16b19ff686656 Mon Sep 17 00:00:00 2001 From: Sedef Date: Wed, 30 Sep 2020 10:04:47 -0700 Subject: [PATCH] Avoid MachineHealthCheck returning early on patch errors --- controllers/machinehealthcheck_controller.go | 123 ++++++++++-------- .../machinehealthcheck_controller_test.go | 67 ++++++++++ controllers/remote/cluster_cache.go | 6 +- 3 files changed, 142 insertions(+), 54 deletions(-) diff --git a/controllers/machinehealthcheck_controller.go b/controllers/machinehealthcheck_controller.go index 3c39f3e7c43..0150f92c970 100644 --- a/controllers/machinehealthcheck_controller.go +++ b/controllers/machinehealthcheck_controller.go @@ -242,13 +242,19 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log EventRemediationRestricted, message, ) + errList := []error{} for _, t := range append(healthy, unhealthy...) { if err := t.patchHelper.Patch(ctx, t.Machine); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "Failed to patch machine status for machine %q", t.Machine.Name) + errList = append(errList, errors.Wrapf(err, "failed to patch machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name)) + continue } } + if len(errList) > 0 { + return ctrl.Result{}, kerrors.NewAggregate(errList) + } return reconcile.Result{Requeue: true}, nil } + logger.V(3).Info( "Remediations are allowed", "total target", totalTargets, @@ -265,19 +271,71 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log m.Status.RemediationsAllowed = int32(maxUnhealthy - unhealthyMachineCount(m)) conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition) + errList := r.PatchUnhealthyTargets(ctx, unhealthy, cluster, m) + errList = append(errList, r.PatchHealthyTargets(ctx, healthy, cluster, m)...) 
+ + // handle update errors + if len(errList) > 0 { + logger.V(3).Info("Error(s) marking machine, requeueing") + return reconcile.Result{}, kerrors.NewAggregate(errList) + } + + if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 { + logger.V(3).Info("Some targets might go unhealthy. Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String()) + return ctrl.Result{RequeueAfter: minNextCheck}, nil + } + + logger.V(3).Info("No more targets meet unhealthy criteria") + + return ctrl.Result{}, nil +} + +// PatchHealthyTargets patches healthy machines with MachineHealthCheckSuccededCondition. +func (r *MachineHealthCheckReconciler) PatchHealthyTargets(ctx context.Context, healthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error { + errList := []error{} + for _, t := range healthy { + if m.Spec.RemediationTemplate != nil { + + // Get remediation request object + obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name) + if err != nil { + if apierrors.IsNotFound(errors.Cause(err)) { + continue + } + r.Log.Error(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName) + } + // Check that obj has no DeletionTimestamp to avoid hot loop + if obj.GetDeletionTimestamp() == nil { + // Issue a delete for remediation request. 
+ if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { + r.Log.Error(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name) + } + } + } + + if err := t.patchHelper.Patch(ctx, t.Machine); err != nil { + r.Log.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName()) + errList = append(errList, errors.Wrapf(err, "failed to patch healthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name)) + } + } + return errList +} + +// PatchUnhealthyTargets patches machines with MachineOwnerRemediatedCondition for remediation +func (r *MachineHealthCheckReconciler) PatchUnhealthyTargets(ctx context.Context, unhealthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error { // mark for remediation errList := []error{} for _, t := range unhealthy { condition := conditions.Get(t.Machine, clusterv1.MachineHealthCheckSuccededCondition) if annotations.IsPaused(cluster, t.Machine) { - logger.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message) + r.Log.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message) } else { if m.Spec.RemediationTemplate != nil { // If external remediation request already exists, // return early if r.externalRemediationRequestExists(ctx, m, t.Machine.Name) { - return ctrl.Result{}, nil + return errList } cloneOwnerRef := &metav1.OwnerReference{ @@ -290,7 +348,8 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace) if err != nil { conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailable, clusterv1.ExternalRemediationTemplateNotFound, 
clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName) + errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName)) + return errList } generateTemplateInput := &external.GenerateTemplateInput{ @@ -302,7 +361,8 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log } to, err := external.GenerateTemplate(generateTemplateInput) if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName) + errList = append(errList, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName)) + return errList } // Set the Remediation Request to match the Machine name, the name is used to @@ -313,21 +373,22 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log // the same Machine, users are in charge of setting health checks and remediation properly. 
to.SetName(t.Machine.Name) - logger.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message) + r.Log.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message) // Create the external clone. if err := r.Client.Create(ctx, to); err != nil { conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailable, clusterv1.ExternalRemediationRequestCreationFailed, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName) + errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName)) + return errList } } else { - logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message) + r.Log.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message) conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "MachineHealthCheck failed") } } if err := t.patchHelper.Patch(ctx, t.Machine); err != nil { - logger.Error(err, "failed to patch unhealthy machine status for machine", "machine", t.Machine) - return ctrl.Result{}, err + errList = append(errList, errors.Wrapf(err, "failed to patch unhealthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name)) + continue } r.recorder.Eventf( t.Machine, @@ -337,47 +398,7 @@ func 
(r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log t.string(), ) } - - for _, t := range healthy { - if m.Spec.RemediationTemplate != nil { - - // Get remediation request object - obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name) - if err != nil { - if apierrors.IsNotFound(errors.Cause(err)) { - continue - } - logger.Error(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName) - } - // Check that obj has no DeletionTimestamp to avoid hot loop - if obj.GetDeletionTimestamp() == nil { - // Issue a delete for remediation request. - if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { - logger.Error(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name) - } - } - } - - if err := t.patchHelper.Patch(ctx, t.Machine); err != nil { - logger.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName()) - return reconcile.Result{}, err - } - } - - // handle update errors - if len(errList) > 0 { - logger.V(3).Info("Error(s) marking machine, requeueing") - return reconcile.Result{}, kerrors.NewAggregate(errList) - } - - if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 { - logger.V(3).Info("Some targets might go unhealthy. 
Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String()) - return ctrl.Result{RequeueAfter: minNextCheck}, nil - } - - logger.V(3).Info("No more targets meet unhealthy criteria") - - return ctrl.Result{}, nil + return errList } // clusterToMachineHealthCheck maps events from Cluster objects to diff --git a/controllers/machinehealthcheck_controller_test.go b/controllers/machinehealthcheck_controller_test.go index 83bb0eb2e0f..22d6a2d8260 100644 --- a/controllers/machinehealthcheck_controller_test.go +++ b/controllers/machinehealthcheck_controller_test.go @@ -33,10 +33,13 @@ import ( "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/record" "k8s.io/utils/pointer" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" + "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" + "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/handler" @@ -2161,3 +2164,67 @@ func newMachineHealthCheck(namespace, clusterName string) *clusterv1.MachineHeal }, } } + +func TestPatchTargets(t *testing.T) { + _ = clusterv1.AddToScheme(scheme.Scheme) + g := NewWithT(t) + + namespace := defaultNamespaceName + clusterName := "test-cluster" + defaultCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: namespace, + }, + } + labels := map[string]string{"cluster": "foo", "nodepool": "bar"} + + mhc := newMachineHealthCheckWithLabels("mhc", namespace, clusterName, labels) + machine1 := newTestMachine("machine1", namespace, clusterName, "nodeName", labels) + machine1.ResourceVersion = "1" + conditions.MarkTrue(machine1, clusterv1.MachineHealthCheckSuccededCondition) + machine2 := machine1.DeepCopy() + machine2.Name = "machine2" + + cl := 
fake.NewFakeClientWithScheme(scheme.Scheme, + machine1, + machine2, + mhc, + ) + r := &MachineHealthCheckReconciler{ + Client: cl, + recorder: record.NewFakeRecorder(32), + Log: log.Log, + scheme: scheme.Scheme, + Tracker: remote.NewTestClusterCacheTracker(cl, scheme.Scheme, client.ObjectKey{Name: clusterName, Namespace: namespace}, "machinehealthcheck-watchClusterNodes"), + } + + // To make the patch fail, create patchHelper with a different client. + fakeMachine := machine1.DeepCopy() + fakeMachine.Name = "fake" + patchHelper, _ := patch.NewHelper(fakeMachine, fake.NewFakeClientWithScheme(scheme.Scheme, fakeMachine)) + // healthCheckTarget with fake patchHelper, patch should fail on this target. + target1 := healthCheckTarget{ + MHC: mhc, + Machine: machine1, + patchHelper: patchHelper, + Node: &corev1.Node{}, + } + + // healthCheckTarget with correct patchHelper. + patchHelper2, _ := patch.NewHelper(machine2, cl) + target3 := healthCheckTarget{ + MHC: mhc, + Machine: machine2, + patchHelper: patchHelper2, + Node: &corev1.Node{}, + } + + // Target with wrong patch helper will fail but the other one will be patched. + g.Expect(len(r.PatchUnhealthyTargets(context.TODO(), []healthCheckTarget{target1, target3}, defaultCluster, mhc))).To(BeNumerically(">", 0)) + g.Expect(cl.Get(ctx, client.ObjectKey{Name: machine2.Name, Namespace: machine2.Namespace}, machine2)).NotTo(HaveOccurred()) + g.Expect(conditions.Get(machine2, clusterv1.MachineOwnerRemediatedCondition).Status).To(Equal(corev1.ConditionFalse)) + + // Target with wrong patch helper will fail but the other one will be patched. 
+ g.Expect(len(r.PatchHealthyTargets(context.TODO(), []healthCheckTarget{target1, target3}, defaultCluster, mhc))).To(BeNumerically(">", 0)) +} diff --git a/controllers/remote/cluster_cache.go b/controllers/remote/cluster_cache.go index 9cdcb1b19b1..5d013f6ee7e 100644 --- a/controllers/remote/cluster_cache.go +++ b/controllers/remote/cluster_cache.go @@ -69,8 +69,8 @@ func NewClusterCacheTracker(log logr.Logger, manager ctrl.Manager) (*ClusterCach }, nil } -// NewTestClusterCacheTracker creates a new dummy ClusterCacheTracker that can be used by unit tests with fake client. -func NewTestClusterCacheTracker(cl client.Client, scheme *runtime.Scheme, objKey client.ObjectKey) *ClusterCacheTracker { +// NewTestClusterCacheTracker creates a new fake ClusterCacheTracker that can be used by unit tests with fake client. +func NewTestClusterCacheTracker(cl client.Client, scheme *runtime.Scheme, objKey client.ObjectKey, watchObjects ...string) *ClusterCacheTracker { testCacheTracker := &ClusterCacheTracker{ log: log.Log, client: cl, @@ -84,7 +84,7 @@ func NewTestClusterCacheTracker(cl client.Client, scheme *runtime.Scheme, objKey Writer: cl, StatusClient: cl, }, - watches: nil, + watches: sets.NewString(watchObjects...), } return testCacheTracker }