Skip to content

Commit

Permalink
Merge pull request #2494 from JoelSpeed/mhc-remediation
Browse files Browse the repository at this point in the history
✨ Add MHC remediation logic
  • Loading branch information
k8s-ci-robot committed Mar 5, 2020
2 parents 7711f95 + 8d6e9fb commit bd12857
Show file tree
Hide file tree
Showing 6 changed files with 619 additions and 2 deletions.
9 changes: 9 additions & 0 deletions api/v1alpha3/machinehealthcheck_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@ func (m *MachineHealthCheck) validate(old *MachineHealthCheck) error {
)
}

if m.Spec.MaxUnhealthy != nil {
if _, err := intstr.GetValueFromIntOrPercent(m.Spec.MaxUnhealthy, 0, false); err != nil {
allErrs = append(
allErrs,
field.Invalid(field.NewPath("spec", "maxUnhealthy"), m.Spec.MaxUnhealthy, "must be either an int or a percentage"),
)
}
}

if len(allErrs) == 0 {
return nil
}
Expand Down
44 changes: 44 additions & 0 deletions api/v1alpha3/machinehealthcheck_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

. "github.com/onsi/gomega"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

func TestMachineHealthCheckDefault(t *testing.T) {
Expand Down Expand Up @@ -181,3 +182,46 @@ func TestMachineHealthCheckNodeStartupTimeout(t *testing.T) {
}
}
}

func TestMachineHealthCheckMaxUnhealthy(t *testing.T) {
tests := []struct {
name string
value intstr.IntOrString
expectErr bool
}{
{
name: "when the value is an integer",
value: intstr.Parse("10"),
expectErr: false,
},
{
name: "when the value is a percentage",
value: intstr.Parse("10%"),
expectErr: false,
},
{
name: "when the value is a random string",
value: intstr.Parse("abcdef"),
expectErr: true,
},
}

for _, tt := range tests {
g := NewWithT(t)

maxUnhealthy := tt.value
mhc := &MachineHealthCheck{
Spec: MachineHealthCheckSpec{
MaxUnhealthy: &maxUnhealthy,
},
}

if tt.expectErr {
g.Expect(mhc.ValidateCreate()).NotTo(Succeed())
g.Expect(mhc.ValidateUpdate(mhc)).NotTo(Succeed())
} else {
g.Expect(mhc.ValidateCreate()).To(Succeed())
g.Expect(mhc.ValidateUpdate(mhc)).To(Succeed())
}
}
}
62 changes: 61 additions & 1 deletion controllers/machinehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"
Expand All @@ -49,6 +50,12 @@ import (
const (
mhcClusterNameIndex = "spec.clusterName"
machineNodeNameIndex = "status.nodeRef.name"

// Event types

// EventRemediationRestricted is emitted in case when machine remediation
// is restricted by remediation circuit shorting logic
EventRemediationRestricted string = "RemediationRestricted"
)

// MachineHealthCheckReconciler reconciles a MachineHealthCheck object
Expand Down Expand Up @@ -206,10 +213,47 @@ func (r *MachineHealthCheckReconciler) reconcile(ctx context.Context, cluster *c
currentHealthy, needRemediationTargets, nextCheckTimes := r.healthCheckTargets(targets, logger, m.Spec.NodeStartupTimeout.Duration)
m.Status.CurrentHealthy = int32(currentHealthy)

// check MHC current health against MaxUnhealthy
if !isAllowedRemediation(m) {
logger.V(3).Info(
"Short-circuiting remediation",
"total target", totalTargets,
"max unhealthy", m.Spec.MaxUnhealthy,
"unhealthy targets", totalTargets-currentHealthy,
)

r.recorder.Eventf(
m,
corev1.EventTypeWarning,
EventRemediationRestricted,
"Remediation restricted due to exceeded number of unhealthy machines (total: %v, unhealthy: %v, maxUnhealthy: %v)",
totalTargets,
totalTargets-currentHealthy,
m.Spec.MaxUnhealthy,
)
return reconcile.Result{Requeue: true}, nil
}
logger.V(3).Info(
"Remediations are allowed",
"total target", totalTargets,
"max unhealthy", m.Spec.MaxUnhealthy,
"unhealthy targets", totalTargets-currentHealthy,
)

// remediate
errList := []error{}
for _, t := range needRemediationTargets {
logger.V(3).Info("Target meets unhealthy criteria, triggers remediation", "target", t.string())
// TODO(JoelSpeed): Implement remediation logic
if err := t.remediate(ctx, logger, r.Client, r.recorder); err != nil {
logger.Error(err, "Error remediating target", "target", t.string())
errList = append(errList, err)
}
}

// handle remediation errors
if len(errList) > 0 {
logger.V(3).Info("Error(s) remediating request, requeueing")
return reconcile.Result{}, kerrors.NewAggregate(errList)
}

if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 {
Expand Down Expand Up @@ -391,3 +435,19 @@ func (r *MachineHealthCheckReconciler) indexMachineByNodeName(object runtime.Obj

return nil
}

// isAllowedRemediation checks the value of the MaxUnhealthy field to determine
// whether remediation should be allowed or not
func isAllowedRemediation(mhc *clusterv1.MachineHealthCheck) bool {
if mhc.Spec.MaxUnhealthy == nil {
return true
}
maxUnhealthy, err := intstr.GetValueFromIntOrPercent(mhc.Spec.MaxUnhealthy, int(mhc.Status.ExpectedMachines), false)
if err != nil {
return false
}

// If unhealthy is above maxUnhealthy, short circuit any further remediation
unhealthy := mhc.Status.ExpectedMachines - mhc.Status.CurrentHealthy
return int(unhealthy) <= maxUnhealthy
}
86 changes: 86 additions & 0 deletions controllers/machinehealthcheck_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/kubernetes/scheme"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
"sigs.k8s.io/cluster-api/controllers/external"
Expand Down Expand Up @@ -777,3 +778,88 @@ func TestIndexMachineByNodeName(t *testing.T) {
})
}
}

func TestIsAllowedRedmediation(t *testing.T) {
testCases := []struct {
name string
maxUnhealthy *intstr.IntOrString
expectedMachines int32
currentHealthy int32
allowed bool
}{
{
name: "when maxUnhealthy is not set",
maxUnhealthy: nil,
expectedMachines: int32(3),
currentHealthy: int32(0),
allowed: true,
},
{
name: "when maxUnhealthy is not an int or percentage",
maxUnhealthy: &intstr.IntOrString{Type: intstr.String, StrVal: "abcdef"},
expectedMachines: int32(5),
currentHealthy: int32(2),
allowed: false,
},
{
name: "when maxUnhealthy is an int less than current unhealthy",
maxUnhealthy: &intstr.IntOrString{Type: intstr.Int, IntVal: int32(1)},
expectedMachines: int32(3),
currentHealthy: int32(1),
allowed: false,
},
{
name: "when maxUnhealthy is an int equal to current unhealthy",
maxUnhealthy: &intstr.IntOrString{Type: intstr.Int, IntVal: int32(2)},
expectedMachines: int32(3),
currentHealthy: int32(1),
allowed: true,
},
{
name: "when maxUnhealthy is an int greater than current unhealthy",
maxUnhealthy: &intstr.IntOrString{Type: intstr.Int, IntVal: int32(3)},
expectedMachines: int32(3),
currentHealthy: int32(1),
allowed: true,
},
{
name: "when maxUnhealthy is a percentage less than current unhealthy",
maxUnhealthy: &intstr.IntOrString{Type: intstr.String, StrVal: "50%"},
expectedMachines: int32(5),
currentHealthy: int32(2),
allowed: false,
},
{
name: "when maxUnhealthy is a percentage equal to current unhealthy",
maxUnhealthy: &intstr.IntOrString{Type: intstr.String, StrVal: "60%"},
expectedMachines: int32(5),
currentHealthy: int32(2),
allowed: true,
},
{
name: "when maxUnhealthy is a percentage greater than current unhealthy",
maxUnhealthy: &intstr.IntOrString{Type: intstr.String, StrVal: "70%"},
expectedMachines: int32(5),
currentHealthy: int32(2),
allowed: true,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := NewWithT(t)

mhc := &clusterv1.MachineHealthCheck{
Spec: clusterv1.MachineHealthCheckSpec{
MaxUnhealthy: tc.maxUnhealthy,
},
Status: clusterv1.MachineHealthCheckStatus{
ExpectedMachines: tc.expectedMachines,
CurrentHealthy: tc.currentHealthy,
},
}

g.Expect(isAllowedRemediation(mhc)).To(Equal(tc.allowed))
})
}
}
101 changes: 101 additions & 0 deletions controllers/machinehealthcheck_targets.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,29 @@ import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
ownerControllerKind = "MachineSet"
nodeControlPlaneLabel = "node-role.kubernetes.io/master"

// Event types

// EventSkippedControlPlane is emitted in case an unhealthy node (or a machine
// associated with the node) has the `master` role
EventSkippedControlPlane string = "SkippedControlPlane"
// EventMachineDeletionFailed is emitted in case remediation of a machine
// is required but deletion of its Machine object failed
EventMachineDeletionFailed string = "MachineDeletionFailed"
// EventMachineDeleted is emitted when machine was successfully remediated
// by deleting its Machine object
EventMachineDeleted string = "MachineDeleted"
// EventDetectedUnhealthy is emitted in case a node associated with a
// machine was detected unhealthy
EventDetectedUnhealthy string = "DetectedUnhealthy"
Expand Down Expand Up @@ -256,3 +273,87 @@ func minDuration(durations []time.Duration) time.Duration {
}
return minDuration
}

// remediate deletes the Machine if it is owned by a MachineSet and is not part of the cluster's control plane
func (t *healthCheckTarget) remediate(ctx context.Context, logger logr.Logger, c client.Client, r record.EventRecorder) error {
logger = logger.WithValues("target", t.string())
logger.Info("Starting remediation for target")

// If the machine is not owned by a MachineSet, it should be skipped
hasOwner, err := t.hasMachineSetOwner()
if err != nil {
return fmt.Errorf("%s: unable to determine Machine owners: %v", t.string(), err)
}
if !hasOwner {
logger.Info("Target has no machineset owner, skipping remediation")
return nil
}

// If the machine is a control plane node, it should be skipped
if t.isControlPlane() {
r.Eventf(
t.Machine,
corev1.EventTypeNormal,
EventSkippedControlPlane,
"Machine %v is a control plane node, skipping remediation",
t.string(),
)
logger.Info("Target is a control plane node, skipping remediation")
return nil
}

logger.Info("Deleting target machine")
if err := c.Delete(ctx, t.Machine); err != nil {
r.Eventf(
t.Machine,
corev1.EventTypeWarning,
EventMachineDeletionFailed,
"Machine %v remediation failed: unable to delete Machine object: %v",
t.string(),
err,
)
return fmt.Errorf("%s: failed to delete machine: %v", t.string(), err)
}
r.Eventf(
t.Machine,
corev1.EventTypeNormal,
EventMachineDeleted,
"Machine %v has been remediated by requesting to delete Machine object",
t.string(),
)

return nil
}

// hasMachineSetOwner checks whether the target's Machine is owned by a MachineSet
func (t *healthCheckTarget) hasMachineSetOwner() (bool, error) {
ownerRefs := t.Machine.ObjectMeta.GetOwnerReferences()
for _, or := range ownerRefs {
if or.Kind == ownerControllerKind {
// The Kind matches so check the Group matches as well
gv, err := schema.ParseGroupVersion(or.APIVersion)
if err != nil {
return false, err
}
if gv.Group == clusterv1.GroupVersion.Group {
return true, nil
}
}
}
return false, nil
}

// isControlPlane checks whether the target refers to a machine that is part of the
// cluster's control plane
func (t *healthCheckTarget) isControlPlane() bool {
if t.Node != nil && labels.Set(t.Node.Labels).Has(nodeControlPlaneLabel) {
return true
}

// if the node is not found, fallback to checking the machine
if t.Machine != nil && labels.Set(t.Machine.Labels).Has(clusterv1.MachineControlPlaneLabelName) {
return true
}

return false
}
Loading

0 comments on commit bd12857

Please sign in to comment.