Merge pull request #3713 from sedefsavas/mhc_err_fix
🌱 Avoid MachineHealthCheck to return early on patch errors
k8s-ci-robot committed Oct 27, 2020
2 parents cb3095b + f894990 commit db6cc2a
Showing 3 changed files with 142 additions and 54 deletions.
123 changes: 72 additions & 51 deletions controllers/machinehealthcheck_controller.go
@@ -242,13 +242,19 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
EventRemediationRestricted,
message,
)
+ errList := []error{}
for _, t := range append(healthy, unhealthy...) {
if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
- return ctrl.Result{}, errors.Wrapf(err, "Failed to patch machine status for machine %q", t.Machine.Name)
+ errList = append(errList, errors.Wrapf(err, "failed to patch machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
+ continue
}
}
+ if len(errList) > 0 {
+ return ctrl.Result{}, kerrors.NewAggregate(errList)
+ }
return reconcile.Result{Requeue: true}, nil
}

logger.V(3).Info(
"Remediations are allowed",
"total target", totalTargets,
@@ -265,19 +271,71 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
m.Status.RemediationsAllowed = int32(maxUnhealthy - unhealthyMachineCount(m))
conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)

+ errList := r.PatchUnhealthyTargets(ctx, unhealthy, cluster, m)
+ errList = append(errList, r.PatchHealthyTargets(ctx, healthy, cluster, m)...)
+
+ // handle update errors
+ if len(errList) > 0 {
+ logger.V(3).Info("Error(s) marking machine, requeueing")
+ return reconcile.Result{}, kerrors.NewAggregate(errList)
+ }
+
+ if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 {
+ logger.V(3).Info("Some targets might go unhealthy. Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String())
+ return ctrl.Result{RequeueAfter: minNextCheck}, nil
+ }
+
+ logger.V(3).Info("No more targets meet unhealthy criteria")
+
+ return ctrl.Result{}, nil
+ }
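The block above is the core pattern of this PR: patch failures are collected per target and aggregated after the loop, so a single failing Machine no longer prevents the remaining targets from being patched or the requeue from being computed. A minimal, self-contained sketch of the collect-then-aggregate pattern (patchAll, patchFn, and the string targets are illustrative stand-ins, not code from this PR):

```go
package main

import (
	"fmt"

	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

// patchAll tries every target and aggregates failures instead of
// returning on the first error.
func patchAll(targets []string, patchFn func(string) error) error {
	errList := []error{}
	for _, t := range targets {
		if err := patchFn(t); err != nil {
			errList = append(errList, fmt.Errorf("failed to patch %s: %v", t, err))
			continue // remaining targets still get patched
		}
	}
	// NewAggregate returns nil for an empty list, so the happy path
	// needs no special casing.
	return kerrors.NewAggregate(errList)
}

func main() {
	err := patchAll([]string{"machine1", "machine2"}, func(name string) error {
		return fmt.Errorf("conflict updating %s", name)
	})
	fmt.Println(err) // both failures are reported, not just the first
}
```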

+ // PatchHealthyTargets patches healthy machines with MachineHealthCheckSuccededCondition.
+ func (r *MachineHealthCheckReconciler) PatchHealthyTargets(ctx context.Context, healthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error {
+ errList := []error{}
+ for _, t := range healthy {
+ if m.Spec.RemediationTemplate != nil {
+
+ // Get remediation request object
+ obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name)
+ if err != nil {
+ if apierrors.IsNotFound(errors.Cause(err)) {
+ continue
+ }
+ r.Log.Error(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName)
+ }
+ // Check that obj has no DeletionTimestamp to avoid hot loop
+ if obj.GetDeletionTimestamp() == nil {
+ // Issue a delete for remediation request.
+ if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
+ r.Log.Error(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name)
+ }
+ }
+ }
+
+ if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
+ r.Log.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName())
+ errList = append(errList, errors.Wrapf(err, "failed to patch healthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
+ }
+ }
+ return errList
+ }
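A subtlety in PatchHealthyTargets above: getExternalRemediationRequest wraps client errors with github.com/pkg/errors, and apierrors.IsNotFound of this apimachinery vintage does not look through such wrappers, which is why the code unwraps with errors.Cause(err) before the not-found check. A small illustrative sketch (not code from this PR):

```go
package main

import (
	"fmt"

	"github.com/pkg/errors"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

func main() {
	notFound := apierrors.NewNotFound(
		schema.GroupResource{Group: "cluster.x-k8s.io", Resource: "machines"}, "machine1")
	wrapped := errors.Wrap(notFound, "failed to get remediation request")

	// The wrapper hides the *StatusError from the type-based check...
	fmt.Println(apierrors.IsNotFound(wrapped)) // false
	// ...so the controller unwraps to the root cause first.
	fmt.Println(apierrors.IsNotFound(errors.Cause(wrapped))) // true
}
```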

+ // PatchUnhealthyTargets patches machines with MachineOwnerRemediatedCondition for remediation
+ func (r *MachineHealthCheckReconciler) PatchUnhealthyTargets(ctx context.Context, unhealthy []healthCheckTarget, cluster *clusterv1.Cluster, m *clusterv1.MachineHealthCheck) []error {
// mark for remediation
errList := []error{}
for _, t := range unhealthy {
condition := conditions.Get(t.Machine, clusterv1.MachineHealthCheckSuccededCondition)

if annotations.IsPaused(cluster, t.Machine) {
logger.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
r.Log.Info("Machine has failed health check, but machine is paused so skipping remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
} else {
if m.Spec.RemediationTemplate != nil {
// If external remediation request already exists,
// return early
if r.externalRemediationRequestExists(ctx, m, t.Machine.Name) {
- return ctrl.Result{}, nil
+ return errList
}

cloneOwnerRef := &metav1.OwnerReference{
@@ -290,7 +348,8 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace)
if err != nil {
conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailable, clusterv1.ExternalRemediationTemplateNotFound, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName)
errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
return errList
}

generateTemplateInput := &external.GenerateTemplateInput{
@@ -302,7 +361,8 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
}
to, err := external.GenerateTemplate(generateTemplateInput)
if err != nil {
return ctrl.Result{}, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName)
errList = append(errList, errors.Wrapf(err, "failed to create template for remediation request %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
return errList
}

// Set the Remediation Request to match the Machine name, the name is used to
@@ -313,21 +373,22 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
// the same Machine, users are in charge of setting health checks and remediation properly.
to.SetName(t.Machine.Name)

logger.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message)
r.Log.Info("Target has failed health check, creating an external remediation request", "remediation request name", to.GetName(), "target", t.string(), "reason", condition.Reason, "message", condition.Message)
// Create the external clone.
if err := r.Client.Create(ctx, to); err != nil {
conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailable, clusterv1.ExternalRemediationRequestCreationFailed, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName)
errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName))
return errList
}
} else {
logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
r.Log.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "MachineHealthCheck failed")
}
}

if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
logger.Error(err, "failed to patch unhealthy machine status for machine", "machine", t.Machine)
return ctrl.Result{}, err
errList = append(errList, errors.Wrapf(err, "failed to patch unhealthy machine status for machine: %s/%s", t.Machine.Namespace, t.Machine.Name))
continue
}
r.recorder.Eventf(
t.Machine,
@@ -337,47 +398,7 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, logger log
t.string(),
)
}

- for _, t := range healthy {
- if m.Spec.RemediationTemplate != nil {
-
- // Get remediation request object
- obj, err := r.getExternalRemediationRequest(ctx, m, t.Machine.Name)
- if err != nil {
- if apierrors.IsNotFound(errors.Cause(err)) {
- continue
- }
- logger.Error(err, "failed to fetch remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.ClusterName)
- }
- // Check that obj has no DeletionTimestamp to avoid hot loop
- if obj.GetDeletionTimestamp() == nil {
- // Issue a delete for remediation request.
- if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
- logger.Error(err, "failed to delete %v %q for Machine %q", obj.GroupVersionKind(), obj.GetName(), t.Machine.Name)
- }
- }
- }
-
- if err := t.patchHelper.Patch(ctx, t.Machine); err != nil {
- logger.Error(err, "failed to patch healthy machine status for machine", "machine", t.Machine.GetName())
- return reconcile.Result{}, err
- }
- }
-
- // handle update errors
- if len(errList) > 0 {
- logger.V(3).Info("Error(s) marking machine, requeueing")
- return reconcile.Result{}, kerrors.NewAggregate(errList)
- }
-
- if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 {
- logger.V(3).Info("Some targets might go unhealthy. Ensuring a requeue happens", "requeueIn", minNextCheck.Truncate(time.Second).String())
- return ctrl.Result{RequeueAfter: minNextCheck}, nil
- }
-
- logger.V(3).Info("No more targets meet unhealthy criteria")
-
- return ctrl.Result{}, nil
+ return errList
}
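Note the asymmetry the new return paths keep: failures to retrieve the remediation template, render it, or create the remediation request still abort PatchUnhealthyTargets immediately (returning whatever errList has accumulated so far), while per-target patch failures merely skip that target and continue. Either way the caller aggregates what comes back, so no error is silently dropped.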

// clusterToMachineHealthCheck maps events from Cluster objects to
67 changes: 67 additions & 0 deletions controllers/machinehealthcheck_controller_test.go
@@ -33,10 +33,13 @@ import (
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
"k8s.io/utils/pointer"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
"sigs.k8s.io/cluster-api/controllers/remote"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
"sigs.k8s.io/controller-runtime/pkg/handler"
@@ -2161,3 +2164,67 @@ func newMachineHealthCheck(namespace, clusterName string) *clusterv1.MachineHeal
},
}
}

+ func TestPatchTargets(t *testing.T) {
+ _ = clusterv1.AddToScheme(scheme.Scheme)
+ g := NewWithT(t)
+
+ namespace := defaultNamespaceName
+ clusterName := "test-cluster"
+ defaultCluster := &clusterv1.Cluster{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: clusterName,
+ Namespace: namespace,
+ },
+ }
+ labels := map[string]string{"cluster": "foo", "nodepool": "bar"}
+
+ mhc := newMachineHealthCheckWithLabels("mhc", namespace, clusterName, labels)
+ machine1 := newTestMachine("machine1", namespace, clusterName, "nodeName", labels)
+ machine1.ResourceVersion = "1"
+ conditions.MarkTrue(machine1, clusterv1.MachineHealthCheckSuccededCondition)
+ machine2 := machine1.DeepCopy()
+ machine2.Name = "machine2"
+
+ cl := fake.NewFakeClientWithScheme(scheme.Scheme,
+ machine1,
+ machine2,
+ mhc,
+ )
+ r := &MachineHealthCheckReconciler{
+ Client: cl,
+ recorder: record.NewFakeRecorder(32),
+ Log: log.Log,
+ scheme: scheme.Scheme,
+ Tracker: remote.NewTestClusterCacheTracker(cl, scheme.Scheme, client.ObjectKey{Name: clusterName, Namespace: namespace}, "machinehealthcheck-watchClusterNodes"),
+ }
+
+ // To make the patch fail, create patchHelper with a different client.
+ fakeMachine := machine1.DeepCopy()
+ fakeMachine.Name = "fake"
+ patchHelper, _ := patch.NewHelper(fakeMachine, fake.NewFakeClientWithScheme(scheme.Scheme, fakeMachine))
+ // healthCheckTarget with fake patchHelper, patch should fail on this target.
+ target1 := healthCheckTarget{
+ MHC: mhc,
+ Machine: machine1,
+ patchHelper: patchHelper,
+ Node: &corev1.Node{},
+ }
+
+ // healthCheckTarget with correct patchHelper.
+ patchHelper2, _ := patch.NewHelper(machine2, cl)
+ target3 := healthCheckTarget{
+ MHC: mhc,
+ Machine: machine2,
+ patchHelper: patchHelper2,
+ Node: &corev1.Node{},
+ }
+
+ // Target with wrong patch helper will fail but the other one will be patched.
+ g.Expect(len(r.PatchUnhealthyTargets(context.TODO(), []healthCheckTarget{target1, target3}, defaultCluster, mhc))).To(BeNumerically(">", 0))
+ g.Expect(cl.Get(ctx, client.ObjectKey{Name: machine2.Name, Namespace: machine2.Namespace}, machine2)).NotTo(HaveOccurred())
+ g.Expect(conditions.Get(machine2, clusterv1.MachineOwnerRemediatedCondition).Status).To(Equal(corev1.ConditionFalse))
+
+ // Target with wrong patch helper will fail but the other one will be patched.
+ g.Expect(len(r.PatchHealthyTargets(context.TODO(), []healthCheckTarget{target1, target3}, defaultCluster, mhc))).To(BeNumerically(">", 0))
+ }
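The failure injection in this test relies on how patch.NewHelper works: the helper snapshots the object at creation time and later issues the patch through the client it was constructed with, so a helper bound to a client that has never seen the object fails at Patch time rather than at construction. A condensed sketch of the trick under that assumption (patchMustFail is a hypothetical helper, not part of this PR):

```go
// patchMustFail demonstrates the failure-injection trick used for target1:
// bind a patch.Helper to a client that does not contain the object.
func patchMustFail(t *testing.T, machine *clusterv1.Machine) {
	g := NewWithT(t)
	emptyClient := fake.NewFakeClientWithScheme(scheme.Scheme) // holds no objects
	helper, err := patch.NewHelper(machine, emptyClient)       // snapshot still succeeds
	g.Expect(err).NotTo(HaveOccurred())

	machine.Labels = map[string]string{"mutated": "true"} // ensure a non-empty diff
	g.Expect(helper.Patch(context.TODO(), machine)).NotTo(Succeed())
}
```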
6 changes: 3 additions & 3 deletions controllers/remote/cluster_cache.go
@@ -69,8 +69,8 @@ func NewClusterCacheTracker(log logr.Logger, manager ctrl.Manager) (*ClusterCach
}, nil
}

- // NewTestClusterCacheTracker creates a new dummy ClusterCacheTracker that can be used by unit tests with fake client.
- func NewTestClusterCacheTracker(cl client.Client, scheme *runtime.Scheme, objKey client.ObjectKey) *ClusterCacheTracker {
+ // NewTestClusterCacheTracker creates a new fake ClusterCacheTracker that can be used by unit tests with fake client.
+ func NewTestClusterCacheTracker(cl client.Client, scheme *runtime.Scheme, objKey client.ObjectKey, watchObjects ...string) *ClusterCacheTracker {
testCacheTracker := &ClusterCacheTracker{
log: log.Log,
client: cl,
@@ -84,7 +84,7 @@ func NewTestClusterCacheTracker(cl client.Client, scheme *runtime.Scheme, objKey
Writer: cl,
StatusClient: cl,
},
- watches: nil,
+ watches: sets.NewString(watchObjects...),
}
return testCacheTracker
}
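Seeding watches with sets.NewString(watchObjects...) instead of leaving it nil means the test tracker reports the named watches as already registered; presumably this is what lets TestPatchTargets above pass "machinehealthcheck-watchClusterNodes", so the reconciler's Watch call becomes a no-op against the fake client instead of attempting to start a real informer.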
