From f894990c0dc0cd8c7b6a2a6431e16b19ff686656 Mon Sep 17 00:00:00 2001 From: Sedef Date: Wed, 30 Sep 2020 10:04:47 -0700 Subject: [PATCH] Avoid MachineHealthCheck returning early on patch errors --- controllers/machinehealthcheck_controller.go | 123 ++++++++++-------- .../machinehealthcheck_controller_test.go | 67 ++++++++++ controllers/remote/cluster_cache.go | 6 +- 3 files changed, 142 insertions(+), 54 deletions(-) diff --git a/controllers/machinehealthcheck_controller.go b/controllers/machinehealthcheck_controller.go index 3c39f3e7c43..0150f92c970 100644 --- a/controllers/machinehealthcheck_controller.go +++ b/controllers/machinehealthcheck_controller.go @@ -242,13 +242,19 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log EventRemediationRestricted, message, ) + errList := []error{} for _, t := range append(healthy, unhealthy...) { if err := t.patchHelper.Patch(ctx, t.Machine); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "Failed to patch machine status for machine %q", t.Machine.Name) + errList = append(errList, errors.Wrapf(err, "failed to patch machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name)) + continue } } + if len(errList) > 0 { + return ctrl.Result{}, kerrors.NewAggregate(errList) + } return reconcile.Result{Requeue: true}, nil } + logger.V(3).Info( "Remediations are allowed", "total target", totalTargets, @@ -265,19 +271,71 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log m.Status.RemediationsAllowed = int32(maxUnhealthy - unhealthyMachineCount(m)) conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition) + errList := r.PatchUnhealthyTargets(ctx, unhealthy, cluster, m) + errList = append(errList, r.PatchHealthyTargets(ctx, healthy, cluster, m)...) 
+ + // handle update errors + if len(errList) > 0 { + logger.V(3).Info("Error(s) marking machine, requeueing") + return reconcile.Result{}, kerrors.NewAggregate(errList) + } + + if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 { + logger.V(3).Info("Some targets might go unhealthy. Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String()) + return ctrl.Result{RequeueAfter: minNextCheck}, nil + } + + logger.V(3).Info("No more targets meet unhealthy criteria") + + return ctrl.Result{}, nil +} + +// PatchHealthyTargets patches healthy machines with MachineHealthCheckSuccededCondition. +func (r *MachineHealthCheckReconciler) PatchHealthyTargets(ctx context.Context, healthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error { + errList := []error{} + for _, t := range healthy { + if m.Spec.RemediationTemplate != nil { + + // Get remediation request object + obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name) + if err != nil { + if apierrors.IsNotFound(errors.Cause(err)) { + continue + } + r.Log.Error(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName) + } + // Check that obj has no DeletionTimestamp to avoid hot loop + if obj.GetDeletionTimestamp() == nil { + // Issue a delete for remediation request. 
+ if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { + r.Log.Error(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name) + } + } + } + + if err := t.patchHelper.Patch(ctx, t.Machine); err != nil { + r.Log.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName()) + errList = append(errList, errors.Wrapf(err, "failed to patch healthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name)) + } + } + return errList +} + +// PatchUnhealthyTargets patches machines with MachineOwnerRemediatedCondition for remediation +func (r *MachineHealthCheckReconciler) PatchUnhealthyTargets(ctx context.Context, unhealthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error { // mark for remediation errList := []error{} for _, t := range unhealthy { condition := conditions.Get(t.Machine, clusterv1.MachineHealthCheckSuccededCondition) if annotations.IsPaused(cluster, t.Machine) { - logger.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message) + r.Log.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message) } else { if m.Spec.RemediationTemplate != nil { // If external remediation request already exists, // return early if r.externalRemediationRequestExists(ctx, m, t.Machine.Name) { - return ctrl.Result{}, nil + return errList } cloneOwnerRef := &metav1.OwnerReference{ @@ -290,7 +348,8 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace) if err != nil { conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailable, clusterv1.ExternalRemediationTemplateNotFound, 
clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName) + errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName)) + return errList } generateTemplateInput := &external.GenerateTemplateInput{ @@ -302,7 +361,8 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log } to, err := external.GenerateTemplate(generateTemplateInput) if err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName) + errList = append(errList, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName)) + return errList } // Set the Remediation Request to match the Machine name, the name is used to @@ -313,21 +373,22 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log // the same Machine, users are in charge of setting health checks and remediation properly. 
to.SetName(t.Machine.Name) - logger.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message) + r.Log.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message) // Create the external clone. if err := r.Client.Create(ctx, to); err != nil { conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailable, clusterv1.ExternalRemediationRequestCreationFailed, clusterv1.ConditionSeverityError, err.Error()) - return ctrl.Result{}, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName) + errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName)) + return errList } } else { - logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message) + r.Log.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message) conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "MachineHealthCheck failed") } } if err := t.patchHelper.Patch(ctx, t.Machine); err != nil { - logger.Error(err, "failed to patch unhealthy machine status for machine", "machine", t.Machine) - return ctrl.Result{}, err + errList = append(errList, errors.Wrapf(err, "failed to patch unhealthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name)) + continue } r.recorder.Eventf( t.Machine, @@ -337,47 +398,7 @@ func 
(r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log t.string(), ) } - - for _, t := range healthy { - if m.Spec.RemediationTemplate != nil { - - // Get remediation request object - obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name) - if err != nil { - if apierrors.IsNotFound(errors.Cause(err)) { - continue - } - logger.Error(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName) - } - // Check that obj has no DeletionTimestamp to avoid hot loop - if obj.GetDeletionTimestamp() == nil { - // Issue a delete for remediation request. - if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { - logger.Error(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name) - } - } - } - - if err := t.patchHelper.Patch(ctx, t.Machine); err != nil { - logger.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName()) - return reconcile.Result{}, err - } - } - - // handle update errors - if len(errList) > 0 { - logger.V(3).Info("Error(s) marking machine, requeueing") - return reconcile.Result{}, kerrors.NewAggregate(errList) - } - - if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 { - logger.V(3).Info("Some targets might go unhealthy. 
Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String()) - return ctrl.Result{RequeueAfter: minNextCheck}, nil - } - - logger.V(3).Info("No more targets meet unhealthy criteria") - - return ctrl.Result{}, nil + return errList } // clusterToMachineHealthCheck maps events from Cluster objects to diff --git a/controllers/machinehealthcheck_controller_test.go b/controllers/machinehealthcheck_controller_test.go index 83bb0eb2e0f..22d6a2d8260 100644 --- a/controllers/machinehealthcheck_controller_test.go +++ b/controllers/machinehealthcheck_controller_test.go @@ -33,10 +33,13 @@ import ( "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/record" "k8s.io/utils/pointer" clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3" + "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" + "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/handler" @@ -2161,3 +2164,67 @@ func newMachineHealthCheck(namespace, clusterName string) *clusterv1.MachineHeal }, } } + +func TestPatchTargets(t *testing.T) { + _ = clusterv1.AddToScheme(scheme.Scheme) + g := NewWithT(t) + + namespace := defaultNamespaceName + clusterName := "test-cluster" + defaultCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: namespace, + }, + } + labels := map[string]string{"cluster": "foo", "nodepool": "bar"} + + mhc := newMachineHealthCheckWithLabels("mhc", namespace, clusterName, labels) + machine1 := newTestMachine("machine1", namespace, clusterName, "nodeName", labels) + machine1.ResourceVersion = "1" + conditions.MarkTrue(machine1, clusterv1.MachineHealthCheckSuccededCondition) + machine2 := machine1.DeepCopy() + machine2.Name = "machine2" + + cl := 
fake.NewFakeClientWithScheme(scheme.Scheme, + machine1, + machine2, + mhc, + ) + r := &MachineHealthCheckReconciler{ + Client: cl, + recorder: record.NewFakeRecorder(32), + Log: log.Log, + scheme: scheme.Scheme, + Tracker: remote.NewTestClusterCacheTracker(cl, scheme.Scheme, client.ObjectKey{Name: clusterName, Namespace: namespace}, "machinehealthcheck-watchClusterNodes"), + } + + // To make the patch fail, create patchHelper with a different client. + fakeMachine := machine1.DeepCopy() + fakeMachine.Name = "fake" + patchHelper, _ := patch.NewHelper(fakeMachine, fake.NewFakeClientWithScheme(scheme.Scheme, fakeMachine)) + // healthCheckTarget with fake patchHelper, patch should fail on this target. + target1 := healthCheckTarget{ + MHC: mhc, + Machine: machine1, + patchHelper: patchHelper, + Node: &corev1.Node{}, + } + + // healthCheckTarget with correct patchHelper. + patchHelper2, _ := patch.NewHelper(machine2, cl) + target3 := healthCheckTarget{ + MHC: mhc, + Machine: machine2, + patchHelper: patchHelper2, + Node: &corev1.Node{}, + } + + // Target with wrong patch helper will fail but the other one will be patched. + g.Expect(len(r.PatchUnhealthyTargets(context.TODO(), []healthCheckTarget{target1, target3}, defaultCluster, mhc))).To(BeNumerically(">", 0)) + g.Expect(cl.Get(ctx, client.ObjectKey{Name: machine2.Name, Namespace: machine2.Namespace}, machine2)).NotTo(HaveOccurred()) + g.Expect(conditions.Get(machine2, clusterv1.MachineOwnerRemediatedCondition).Status).To(Equal(corev1.ConditionFalse)) + + // Target with wrong patch helper will fail but the other one will be patched. 
+ g.Expect(len(r.PatchHealthyTargets(context.TODO(), []healthCheckTarget{target1, target3}, defaultCluster, mhc))).To(BeNumerically(">", 0)) +} diff --git a/controllers/remote/cluster_cache.go b/controllers/remote/cluster_cache.go index 9cdcb1b19b1..5d013f6ee7e 100644 --- a/controllers/remote/cluster_cache.go +++ b/controllers/remote/cluster_cache.go @@ -69,8 +69,8 @@ func NewClusterCacheTracker(log logr.Logger, manager ctrl.Manager) (*ClusterCach }, nil } -// NewTestClusterCacheTracker creates a new dummy ClusterCacheTracker that can be used by unit tests with fake client. -func NewTestClusterCacheTracker(cl client.Client, scheme *runtime.Scheme, objKey client.ObjectKey) *ClusterCacheTracker { +// NewTestClusterCacheTracker creates a new fake ClusterCacheTracker that can be used by unit tests with fake client. +func NewTestClusterCacheTracker(cl client.Client, scheme *runtime.Scheme, objKey client.ObjectKey, watchObjects ...string) *ClusterCacheTracker { testCacheTracker := &ClusterCacheTracker{ log: log.Log, client: cl, @@ -84,7 +84,7 @@ func NewTestClusterCacheTracker(cl client.Client, scheme *runtime.Scheme, objKey Writer: cl, StatusClient: cl, }, - watches: nil, + watches: sets.NewString(watchObjects...), } return testCacheTracker }