Skip to content

Commit

Permalink
MachineHealthCheck External Remediation
Browse files Browse the repository at this point in the history
  • Loading branch information
jan-est committed Oct 19, 2020
2 parents af66309 + cbdb964 commit 050798a
Show file tree
Hide file tree
Showing 16 changed files with 714 additions and 77 deletions.
1 change: 0 additions & 1 deletion api/v1alpha2/conversion.go
Expand Up @@ -119,7 +119,6 @@ func (src *Machine) ConvertTo(dstRaw conversion.Hub) error {
restoreMachineSpec(&restored.Spec, &dst.Spec)
dst.Status.ObservedGeneration = restored.Status.ObservedGeneration
dst.Status.Conditions = restored.Status.Conditions

return nil
}

Expand Down
7 changes: 7 additions & 0 deletions api/v1alpha3/condition_consts.go
Expand Up @@ -126,4 +126,11 @@ const (

// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
WaitingForRemediationReason = "WaitingForRemediation"

// ExternalRemediationTemplateAvailable is set on machinehealthchecks when MachineHealthCheck controller uses external remediation.
// ExternalRemediationTemplateAvailable is set to false if external remediation template is not found.
ExternalRemediationTemplateAvailable ConditionType = "ExternalRemediationTemplateAvailable"

// ExternalRemediationTemplateNotFound is the reason used when a machine health check fails to find external remediation template.
ExternalRemediationTemplateNotFound = "ExternalRemediationTemplateNotFound"
)
21 changes: 21 additions & 0 deletions api/v1alpha3/machinehealthcheck_types.go
Expand Up @@ -49,6 +49,15 @@ type MachineHealthCheckSpec struct {
// failed and will be remediated.
// +optional
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`

// RemediationTemplate is a reference to a remediation template
// provided by an infrastructure provider.
//
// This field is completely optional, when filled, the MachineHealthCheck controller
// creates a new object from the template referenced and hands off remediation of the machine to
// a controller that lives outside of Cluster API.
// +optional
RemediationTemplate *corev1.ObjectReference `json:"remediationTemplate,omitempty"`
}

// ANCHOR_END: MachineHealthCHeckSpec
Expand Down Expand Up @@ -91,6 +100,10 @@ type MachineHealthCheckStatus struct {
// Targets shows the current list of machines the machine health check is watching
// +optional
Targets []string `json:"targets,omitempty"`

// Conditions defines current service state of the MachineHealthCheck.
// +optional
Conditions Conditions `json:"conditions,omitempty"`
}

// ANCHOR_END: MachineHealthCheckStatus
Expand All @@ -115,6 +128,14 @@ type MachineHealthCheck struct {
Status MachineHealthCheckStatus `json:"status,omitempty"`
}

// GetConditions returns the conditions currently stored in the
// MachineHealthCheck's status.
func (m *MachineHealthCheck) GetConditions() Conditions {
return m.Status.Conditions
}

// SetConditions replaces the conditions stored in the MachineHealthCheck's
// status with the given conditions.
func (m *MachineHealthCheck) SetConditions(conditions Conditions) {
m.Status.Conditions = conditions
}

// +kubebuilder:object:root=true

// MachineHealthCheckList contains a list of MachineHealthCheck
Expand Down
12 changes: 12 additions & 0 deletions api/v1alpha3/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

85 changes: 85 additions & 0 deletions config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
Expand Up @@ -71,6 +71,47 @@ spec:
description: Machines older than this duration without a node will
be considered to have failed and will be remediated.
type: string
remediationTemplate:
description: "RemediationTemplate is a reference to a remediation
template provided by an infrastructure provider. \n This field is
completely optional, when filled, the MachineHealthCheck controller
creates a new object from the template referenced and hands off
remediation of the machine to a controller that lives outside of
Cluster API."
properties:
apiVersion:
description: API version of the referent.
type: string
fieldPath:
description: 'If referring to a piece of an object instead of
an entire object, this string should contain a valid JSON/Go
field access statement, such as desiredState.manifest.containers[2].
For example, if the object reference is to a container within
a pod, this would take on a value like: "spec.containers{name}"
(where "name" refers to the name of the container that triggered
the event) or if no container name is specified "spec.containers[2]"
(container with index 2 in this pod). This syntax is chosen
only to have some well-defined way of referencing a part of
an object. TODO: this design is not final and this field is
subject to change in the future.'
type: string
kind:
description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
type: string
namespace:
description: 'Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/'
type: string
resourceVersion:
description: 'Specific resourceVersion to which this reference
is made, if any. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency'
type: string
uid:
description: 'UID of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids'
type: string
type: object
selector:
description: Label selector to match machines whose health will be
exercised
Expand Down Expand Up @@ -150,6 +191,50 @@ spec:
status:
description: Most recently observed status of MachineHealthCheck resource
properties:
conditions:
description: Conditions defines current service state of the MachineHealthCheck.
items:
description: Condition defines an observation of a Cluster API resource
operational state.
properties:
lastTransitionTime:
description: Last time the condition transitioned from one status
to another. This should be when the underlying condition changed.
If that is not known, then using the time when the API field
changed is acceptable.
format: date-time
type: string
message:
description: A human readable message indicating details about
the transition. This field may be empty.
type: string
reason:
description: The reason for the condition's last transition
in CamelCase. The specific API may choose whether or not this
field is considered a guaranteed API. This field may not be
empty.
type: string
severity:
description: Severity provides an explicit classification of
Reason code, so the users or machines can immediately understand
the current situation and act accordingly. The Severity field
MUST be set only when Status=False.
type: string
status:
description: Status of the condition, one of True, False, Unknown.
type: string
type:
description: Type of condition in CamelCase or in foo.example.com/CamelCase.
Many .condition.type values are consistent across resources
like Available, but because arbitrary conditions can be useful
(see .node.status.conditions), the ability to deconflict is
important.
type: string
required:
- status
- type
type: object
type: array
currentHealthy:
description: total number of healthy machines counted by this machine
health check
Expand Down
92 changes: 92 additions & 0 deletions controllers/external/testing.go
Expand Up @@ -207,4 +207,96 @@ var (
},
},
}

// TestGenericInfrastructureRemediationCRD is a CustomResourceDefinition for
// the namespaced InfrastructureRemediation kind in the
// infrastructure.cluster.x-k8s.io group. It declares a single v1alpha3
// version that is both served and stored, enables the status subresource,
// and preserves unknown fields under both spec and status.
TestGenericInfrastructureRemediationCRD = &apiextensionsv1.CustomResourceDefinition{
TypeMeta: metav1.TypeMeta{
APIVersion: apiextensionsv1.SchemeGroupVersion.String(),
Kind: "CustomResourceDefinition",
},
ObjectMeta: metav1.ObjectMeta{
Name: "infrastructureremediations.infrastructure.cluster.x-k8s.io",
// The label key is the Cluster API group/version string and the value is
// this CRD's version.
// NOTE(review): presumably consumed as a contract-version label — confirm.
Labels: map[string]string{
clusterv1.GroupVersion.String(): "v1alpha3",
},
},
Spec: apiextensionsv1.CustomResourceDefinitionSpec{
Group: "infrastructure.cluster.x-k8s.io",
Scope: apiextensionsv1.NamespaceScoped,
Names: apiextensionsv1.CustomResourceDefinitionNames{
Kind: "InfrastructureRemediation",
Plural: "infrastructureremediations",
},
Versions: []apiextensionsv1.CustomResourceDefinitionVersion{
{
Name: "v1alpha3",
Served: true,
Storage: true,
Subresources: &apiextensionsv1.CustomResourceSubresources{
Status: &apiextensionsv1.CustomResourceSubresourceStatus{},
},
Schema: &apiextensionsv1.CustomResourceValidation{
OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{
Type: "object",
Properties: map[string]apiextensionsv1.JSONSchemaProps{
// spec and status are untyped objects: any fields are accepted as-is.
"spec": {
Type: "object",
XPreserveUnknownFields: pointer.BoolPtr(true),
},
"status": {
Type: "object",
XPreserveUnknownFields: pointer.BoolPtr(true),
},
},
},
},
},
},
},
}

// TestGenericInfrastructureRemediationTemplateCRD is a
// CustomResourceDefinition for the namespaced
// InfrastructureRemediationTemplate kind in the
// infrastructure.cluster.x-k8s.io group. It declares a single v1alpha3
// version that is both served and stored, enables the status subresource,
// and preserves unknown fields under both spec and status.
TestGenericInfrastructureRemediationTemplateCRD = &apiextensionsv1.CustomResourceDefinition{
TypeMeta: metav1.TypeMeta{
APIVersion: apiextensionsv1.SchemeGroupVersion.String(),
Kind: "CustomResourceDefinition",
},
ObjectMeta: metav1.ObjectMeta{
Name: "infrastructureremediationtemplates.infrastructure.cluster.x-k8s.io",
// The label key is the Cluster API group/version string and the value is
// this CRD's version.
// NOTE(review): presumably consumed as a contract-version label — confirm.
Labels: map[string]string{
clusterv1.GroupVersion.String(): "v1alpha3",
},
},
Spec: apiextensionsv1.CustomResourceDefinitionSpec{
Group: "infrastructure.cluster.x-k8s.io",
Scope: apiextensionsv1.NamespaceScoped,
Names: apiextensionsv1.CustomResourceDefinitionNames{
Kind: "InfrastructureRemediationTemplate",
Plural: "infrastructureremediationtemplates",
},
Versions: []apiextensionsv1.CustomResourceDefinitionVersion{
{
Name: "v1alpha3",
Served: true,
Storage: true,
Subresources: &apiextensionsv1.CustomResourceSubresources{
Status: &apiextensionsv1.CustomResourceSubresourceStatus{},
},
Schema: &apiextensionsv1.CustomResourceValidation{
OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{
Type: "object",
Properties: map[string]apiextensionsv1.JSONSchemaProps{
// spec and status are untyped objects: any fields are accepted as-is.
"spec": {
Type: "object",
XPreserveUnknownFields: pointer.BoolPtr(true),
},
"status": {
Type: "object",
XPreserveUnknownFields: pointer.BoolPtr(true),
},
},
},
},
},
},
},
}
)
30 changes: 12 additions & 18 deletions controllers/machine_controller.go
Expand Up @@ -235,8 +235,6 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
}

func (r *MachineReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (ctrl.Result, error) {
logger := r.Log.WithValues("machine", m.Name, "namespace", m.Namespace)
logger = logger.WithValues("cluster", cluster.Name)

// If the Machine belongs to a cluster, add an owner reference.
if r.shouldAdopt(m) {
Expand All @@ -248,28 +246,24 @@ func (r *MachineReconciler) reconcile(ctx context.Context, cluster *clusterv1.Cl
})
}

// Call the inner reconciliation methods.
reconciliationErrors := []error{
r.reconcileBootstrap(ctx, cluster, m),
r.reconcileInfrastructure(ctx, cluster, m),
r.reconcileNodeRef(ctx, cluster, m),
phases := []func(context.Context, *clusterv1.Cluster, *clusterv1.Machine) (ctrl.Result, error){
r.reconcileBootstrap,
r.reconcileInfrastructure,
r.reconcileNodeRef,
}

// Parse the errors, making sure we record if there is a RequeueAfterError.
res := ctrl.Result{}
errs := []error{}
for _, err := range reconciliationErrors {
if requeueErr, ok := errors.Cause(err).(capierrors.HasRequeueAfterError); ok {
// Only record and log the first RequeueAfterError.
if !res.Requeue {
res.Requeue = true
res.RequeueAfter = requeueErr.GetRequeueAfter()
logger.Error(err, "Reconciliation for Machine asked to requeue")
}
for _, phase := range phases {
// Call the inner reconciliation methods.
phaseResult, err := phase(ctx, cluster, m)
if err != nil {
errs = append(errs, err)
}
if len(errs) > 0 {
continue
}

errs = append(errs, err)
res = util.LowestNonZeroResult(res, phaseResult)
}
return res, kerrors.NewAggregate(errs)
}
Expand Down

0 comments on commit 050798a

Please sign in to comment.