From 73899ee513e346cc388299c96427bc7278599a53 Mon Sep 17 00:00:00 2001 From: dntosas Date: Tue, 4 May 2021 16:33:40 +0300 Subject: [PATCH] [addons] Introduce NodeProblemDetector Node Problem Detector aims to make various node problems visible to the upstream layers in the cluster management stack. It is a daemon that runs on each node, detects node problems and reports them to the apiserver, so that new pods are not scheduled onto bad nodes and the problems affecting the underlying nodes are easy to identify. Project Home: https://github.com/kubernetes/node-problem-detector Signed-off-by: dntosas --- k8s/crds/kops.k8s.io_clusters.yaml | 29 +++ pkg/apis/kops/cluster.go | 2 + pkg/apis/kops/componentconfig.go | 17 ++ pkg/apis/kops/v1alpha2/cluster.go | 2 + pkg/apis/kops/v1alpha2/componentconfig.go | 17 ++ .../kops/v1alpha2/zz_generated.conversion.go | 54 ++++++ .../kops/v1alpha2/zz_generated.deepcopy.go | 36 ++++ pkg/apis/kops/zz_generated.deepcopy.go | 36 ++++ pkg/model/components/BUILD.bazel | 1 + pkg/model/components/nodeproblemdetector.go | 63 ++++++ upup/models/BUILD.bazel | 1 + .../k8s-1.19.yaml.template | 183 ++++++++++++++++++ .../bootstrapchannelbuilder.go | 21 ++ upup/pkg/fi/cloudup/populate_cluster_spec.go | 1 + 14 files changed, 463 insertions(+) create mode 100644 pkg/model/components/nodeproblemdetector.go create mode 100644 upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.19.yaml.template diff --git a/k8s/crds/kops.k8s.io_clusters.yaml b/k8s/crds/kops.k8s.io_clusters.yaml index f42f78a89f183..53383fb25cc62 100644 --- a/k8s/crds/kops.k8s.io_clusters.yaml +++ b/k8s/crds/kops.k8s.io_clusters.yaml @@ -3962,6 +3962,35 @@ spec: items: type: string type: array + nodeProblemDetector: + description: NodeProblemDetector determines the node problem detector + configuration. + properties: + cpuRequest: + anyOf: + - type: integer + - type: string + description: 'CPURequest of NodeProblemDetector container. 
Default: + 10m' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + enabled: + description: 'Enabled enables the NodeProblemDetector. Default: + false' + type: boolean + image: + description: 'Image is the NodeProblemDetector docker container + used Default: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7' + type: string + memoryRequest: + anyOf: + - type: integer + - type: string + description: 'MemoryRequest of NodeProblemDetector container. + Default: 32Mi' + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object nodeTerminationHandler: description: NodeTerminationHandler determines the cluster autoscaler configuration. diff --git a/pkg/apis/kops/cluster.go b/pkg/apis/kops/cluster.go index 602dd6044a8b7..fb4ae3f33d079 100644 --- a/pkg/apis/kops/cluster.go +++ b/pkg/apis/kops/cluster.go @@ -161,6 +161,8 @@ type ClusterSpec struct { // NodeTerminationHandler determines the node termination handler configuration. NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"` + // NodeProblemDetector determines the node problem detector configuration. + NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"` // MetricsServer determines the metrics server configuration. MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"` // CertManager determines the metrics server configuration. 
diff --git a/pkg/apis/kops/componentconfig.go b/pkg/apis/kops/componentconfig.go index 792e660adcff0..99b411d09412e 100644 --- a/pkg/apis/kops/componentconfig.go +++ b/pkg/apis/kops/componentconfig.go @@ -880,6 +880,23 @@ type NodeTerminationHandlerConfig struct { CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` } +// NodeProblemDetectorConfig determines the node problem detector configuration. +type NodeProblemDetectorConfig struct { + // Enabled enables the NodeProblemDetector. + // Default: false + Enabled *bool `json:"enabled,omitempty"` + // Image is the NodeProblemDetector docker container used + // Default: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7 + Image string `json:"image,omitempty"` + + // MemoryRequest of NodeProblemDetector container. + // Default: 32Mi + MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"` + // CPURequest of NodeProblemDetector container. + // Default: 10m + CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` +} + // ClusterAutoscalerConfig determines the cluster autoscaler configuration. type ClusterAutoscalerConfig struct { // Enabled enables the cluster autoscaler. diff --git a/pkg/apis/kops/v1alpha2/cluster.go b/pkg/apis/kops/v1alpha2/cluster.go index 7a62d236c88ea..9963d219bfb30 100644 --- a/pkg/apis/kops/v1alpha2/cluster.go +++ b/pkg/apis/kops/v1alpha2/cluster.go @@ -160,6 +160,8 @@ type ClusterSpec struct { // NodeTerminationHandler determines the cluster autoscaler configuration. NodeTerminationHandler *NodeTerminationHandlerConfig `json:"nodeTerminationHandler,omitempty"` + // NodeProblemDetector determines the node problem detector configuration. + NodeProblemDetector *NodeProblemDetectorConfig `json:"nodeProblemDetector,omitempty"` // MetricsServer determines the metrics server configuration. MetricsServer *MetricsServerConfig `json:"metricsServer,omitempty"` // CertManager determines the metrics server configuration. 
diff --git a/pkg/apis/kops/v1alpha2/componentconfig.go b/pkg/apis/kops/v1alpha2/componentconfig.go index 378ca162845b2..c098d0aac2e90 100644 --- a/pkg/apis/kops/v1alpha2/componentconfig.go +++ b/pkg/apis/kops/v1alpha2/componentconfig.go @@ -879,6 +879,23 @@ type NodeTerminationHandlerConfig struct { CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` } +// NodeProblemDetectorConfig determines the node problem detector configuration. +type NodeProblemDetectorConfig struct { + // Enabled enables the NodeProblemDetector. + // Default: false + Enabled *bool `json:"enabled,omitempty"` + // Image is the NodeProblemDetector docker container used + // Default: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7 + Image string `json:"image,omitempty"` + + // MemoryRequest of NodeProblemDetector container. + // Default: 32Mi + MemoryRequest *resource.Quantity `json:"memoryRequest,omitempty"` + // CPURequest of NodeProblemDetector container. + // Default: 10m + CPURequest *resource.Quantity `json:"cpuRequest,omitempty"` +} + // ClusterAutoscalerConfig determines the cluster autoscaler configuration. type ClusterAutoscalerConfig struct { // Enabled enables the cluster autoscaler. 
diff --git a/pkg/apis/kops/v1alpha2/zz_generated.conversion.go b/pkg/apis/kops/v1alpha2/zz_generated.conversion.go index c12fdb801299d..7fc2db31efbca 100644 --- a/pkg/apis/kops/v1alpha2/zz_generated.conversion.go +++ b/pkg/apis/kops/v1alpha2/zz_generated.conversion.go @@ -853,6 +853,16 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddGeneratedConversionFunc((*NodeProblemDetectorConfig)(nil), (*kops.NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(a.(*NodeProblemDetectorConfig), b.(*kops.NodeProblemDetectorConfig), scope) + }); err != nil { + return err + } + if err := s.AddGeneratedConversionFunc((*kops.NodeProblemDetectorConfig)(nil), (*NodeProblemDetectorConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(a.(*kops.NodeProblemDetectorConfig), b.(*NodeProblemDetectorConfig), scope) + }); err != nil { + return err + } if err := s.AddGeneratedConversionFunc((*NodeTerminationHandlerConfig)(nil), (*kops.NodeTerminationHandlerConfig)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(a.(*NodeTerminationHandlerConfig), b.(*kops.NodeTerminationHandlerConfig), scope) }); err != nil { @@ -2425,6 +2435,15 @@ func autoConvert_v1alpha2_ClusterSpec_To_kops_ClusterSpec(in *ClusterSpec, out * } else { out.NodeTerminationHandler = nil } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(kops.NodeProblemDetectorConfig) + if err := Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(*in, *out, s); err != nil { + return err + } + } else { + out.NodeProblemDetector = nil + } if in.MetricsServer != nil { in, out := 
&in.MetricsServer, &out.MetricsServer *out = new(kops.MetricsServerConfig) @@ -2819,6 +2838,15 @@ func autoConvert_kops_ClusterSpec_To_v1alpha2_ClusterSpec(in *kops.ClusterSpec, } else { out.NodeTerminationHandler = nil } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(NodeProblemDetectorConfig) + if err := Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(*in, *out, s); err != nil { + return err + } + } else { + out.NodeProblemDetector = nil + } if in.MetricsServer != nil { in, out := &in.MetricsServer, &out.MetricsServer *out = new(MetricsServerConfig) @@ -5874,6 +5902,32 @@ func Convert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in *kops.Nod return autoConvert_kops_NodeLocalDNSConfig_To_v1alpha2_NodeLocalDNSConfig(in, out, s) } +func autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error { + out.Enabled = in.Enabled + out.Image = in.Image + out.MemoryRequest = in.MemoryRequest + out.CPURequest = in.CPURequest + return nil +} + +// Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig is an autogenerated conversion function. 
+func Convert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in *NodeProblemDetectorConfig, out *kops.NodeProblemDetectorConfig, s conversion.Scope) error { + return autoConvert_v1alpha2_NodeProblemDetectorConfig_To_kops_NodeProblemDetectorConfig(in, out, s) +} + +func autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error { + out.Enabled = in.Enabled + out.Image = in.Image + out.MemoryRequest = in.MemoryRequest + out.CPURequest = in.CPURequest + return nil +} + +// Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig is an autogenerated conversion function. +func Convert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in *kops.NodeProblemDetectorConfig, out *NodeProblemDetectorConfig, s conversion.Scope) error { + return autoConvert_kops_NodeProblemDetectorConfig_To_v1alpha2_NodeProblemDetectorConfig(in, out, s) +} + func autoConvert_v1alpha2_NodeTerminationHandlerConfig_To_kops_NodeTerminationHandlerConfig(in *NodeTerminationHandlerConfig, out *kops.NodeTerminationHandlerConfig, s conversion.Scope) error { out.Enabled = in.Enabled out.EnableSpotInterruptionDraining = in.EnableSpotInterruptionDraining diff --git a/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go b/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go index 9e7eb1f0a8ef6..b80fb2342572e 100644 --- a/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go +++ b/pkg/apis/kops/v1alpha2/zz_generated.deepcopy.go @@ -1025,6 +1025,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) { *out = new(NodeTerminationHandlerConfig) (*in).DeepCopyInto(*out) } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(NodeProblemDetectorConfig) + (*in).DeepCopyInto(*out) + } if in.MetricsServer != nil { in, out := &in.MetricsServer, &out.MetricsServer *out = new(MetricsServerConfig) @@ 
-3929,6 +3934,37 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.MemoryRequest != nil { + in, out := &in.MemoryRequest, &out.MemoryRequest + x := (*in).DeepCopy() + *out = &x + } + if in.CPURequest != nil { + in, out := &in.CPURequest, &out.CPURequest + x := (*in).DeepCopy() + *out = &x + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig. +func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig { + if in == nil { + return nil + } + out := new(NodeProblemDetectorConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) { *out = *in diff --git a/pkg/apis/kops/zz_generated.deepcopy.go b/pkg/apis/kops/zz_generated.deepcopy.go index 5361c8280e27c..aff5425ca00a0 100644 --- a/pkg/apis/kops/zz_generated.deepcopy.go +++ b/pkg/apis/kops/zz_generated.deepcopy.go @@ -1125,6 +1125,11 @@ func (in *ClusterSpec) DeepCopyInto(out *ClusterSpec) { *out = new(NodeTerminationHandlerConfig) (*in).DeepCopyInto(*out) } + if in.NodeProblemDetector != nil { + in, out := &in.NodeProblemDetector, &out.NodeProblemDetector + *out = new(NodeProblemDetectorConfig) + (*in).DeepCopyInto(*out) + } if in.MetricsServer != nil { in, out := &in.MetricsServer, &out.MetricsServer *out = new(MetricsServerConfig) @@ -4127,6 +4132,37 @@ func (in *NodeLocalDNSConfig) DeepCopy() *NodeLocalDNSConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeProblemDetectorConfig) DeepCopyInto(out *NodeProblemDetectorConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.MemoryRequest != nil { + in, out := &in.MemoryRequest, &out.MemoryRequest + x := (*in).DeepCopy() + *out = &x + } + if in.CPURequest != nil { + in, out := &in.CPURequest, &out.CPURequest + x := (*in).DeepCopy() + *out = &x + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeProblemDetectorConfig. +func (in *NodeProblemDetectorConfig) DeepCopy() *NodeProblemDetectorConfig { + if in == nil { + return nil + } + out := new(NodeProblemDetectorConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *NodeTerminationHandlerConfig) DeepCopyInto(out *NodeTerminationHandlerConfig) { *out = *in diff --git a/pkg/model/components/BUILD.bazel b/pkg/model/components/BUILD.bazel index 1cc61c66a45a2..8aed9d67e7278 100644 --- a/pkg/model/components/BUILD.bazel +++ b/pkg/model/components/BUILD.bazel @@ -21,6 +21,7 @@ go_library( "kubeproxy.go", "kubescheduler.go", "networking.go", + "nodeproblemdetector.go", "nodeterminationhandler.go", "openstack.go", ], diff --git a/pkg/model/components/nodeproblemdetector.go b/pkg/model/components/nodeproblemdetector.go new file mode 100644 index 0000000000000..7b4958bec5213 --- /dev/null +++ b/pkg/model/components/nodeproblemdetector.go @@ -0,0 +1,63 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package components + +import ( + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/kops/pkg/apis/kops" + "k8s.io/kops/upup/pkg/fi" + "k8s.io/kops/upup/pkg/fi/loader" +) + +// NodeProblemDetectorOptionsBuilder adds options for the node problem detector to the model. 
+type NodeProblemDetectorOptionsBuilder struct {
+	*OptionsContext
+}
+
+var _ loader.OptionsBuilder = &NodeProblemDetectorOptionsBuilder{}
+
+// BuildOptions fills in defaults for any node problem detector settings that
+// were left unset; it is a no-op when the addon is not configured at all.
+func (b *NodeProblemDetectorOptionsBuilder) BuildOptions(o interface{}) error {
+	clusterSpec := o.(*kops.ClusterSpec)
+	if clusterSpec.NodeProblemDetector == nil {
+		return nil
+	}
+	npd := clusterSpec.NodeProblemDetector
+
+	// The addon is opt-in: an unset Enabled is recorded as an explicit false.
+	if npd.Enabled == nil {
+		npd.Enabled = fi.Bool(false)
+	}
+
+	if npd.CPURequest == nil {
+		defaultCPURequest := resource.MustParse("10m")
+		npd.CPURequest = &defaultCPURequest
+	}
+
+	if npd.MemoryRequest == nil {
+		defaultMemoryRequest := resource.MustParse("32Mi")
+		npd.MemoryRequest = &defaultMemoryRequest
+	}
+
+	// Default to the pinned upstream release image when none is given.
+	if npd.Image == "" {
+		npd.Image = "k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7"
+	}
+
+	return nil
+}
diff --git a/upup/models/BUILD.bazel b/upup/models/BUILD.bazel
index b36b98eab67d4..a291cdc9adcfb 100644
--- a/upup/models/BUILD.bazel
+++ b/upup/models/BUILD.bazel
@@ -56,6 +56,7 @@ go_library(
         "cloudup/resources/addons/storage-openstack.addons.k8s.io/k8s-1.16.yaml.template",
         "cloudup/resources/addons/networking.cilium.io/k8s-1.16-v1.10.yaml.template",
         "cloudup/resources/addons/networking.cilium.io/k8s-1.12-v1.9.yaml.template",
+        "cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.19.yaml.template",
     ],
     importpath = "k8s.io/kops/upup/models",
     visibility = ["//visibility:public"],
diff --git a/upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.19.yaml.template b/upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.19.yaml.template
new file mode 100644
index 0000000000000..17243cc78bb9d
--- /dev/null
+++ b/upup/models/cloudup/resources/addons/node-problem-detector.addons.k8s.io/k8s-1.19.yaml.template
@@ -0,0 +1,183 @@
+{{ with .NodeProblemDetector }}
+# Sourced from https://github.com/kubernetes/node-problem-detector/tree/v0.8.7
+---
+# 
Source: node-problem-detector/deployment/node-problem-detector.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-problem-detector + namespace: kube-system + labels: + app: node-problem-detector +spec: + selector: + matchLabels: + app: node-problem-detector + template: + metadata: + labels: + app: node-problem-detector + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + containers: + - name: node-problem-detector + command: + - /node-problem-detector + - --logtostderr + - --config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json + image: {{ .Image }} + resources: + requests: + cpu: {{ .CPURequest }} + memory: {{ .MemoryRequest }} + imagePullPolicy: Always + securityContext: + privileged: true + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: log + mountPath: /var/log + readOnly: true + - name: kmsg + mountPath: /dev/kmsg + readOnly: true + # Make sure node problem detector is in the same timezone + # with the host. 
+ - name: localtime + mountPath: /etc/localtime + readOnly: true + - name: config + mountPath: /config + readOnly: true + volumes: + - name: log + # Config `log` to your system log directory + hostPath: + path: /var/log/ + - name: kmsg + hostPath: + path: /dev/kmsg + - name: localtime + hostPath: + path: /etc/localtime + - name: config + configMap: + name: node-problem-detector-config + items: + - key: kernel-monitor.json + path: kernel-monitor.json + - key: docker-monitor.json + path: docker-monitor.json + priorityClassName: system-node-critical + tolerations: + - operator: Exists +--- +# Source: node-problem-detector/deployment/node-problem-detector-config.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: node-problem-detector-config + namespace: kube-system +data: + kernel-monitor.json: | + { + "plugin": "kmsg", + "logPath": "/dev/kmsg", + "lookback": "5m", + "bufferSize": 10, + "source": "kernel-monitor", + "conditions": [ + { + "type": "KernelDeadlock", + "reason": "KernelHasNoDeadlock", + "message": "kernel has no deadlock" + }, + { + "type": "ReadonlyFilesystem", + "reason": "FilesystemIsNotReadOnly", + "message": "Filesystem is not read-only" + } + ], + "rules": [ + { + "type": "temporary", + "reason": "OOMKilling", + "pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*" + }, + { + "type": "temporary", + "reason": "TaskHung", + "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "temporary", + "reason": "UnregisterNetDevice", + "pattern": "unregister_netdevice: waiting for \\w+ to become free. 
Usage count = \\d+" + }, + { + "type": "temporary", + "reason": "KernelOops", + "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*" + }, + { + "type": "temporary", + "reason": "KernelOops", + "pattern": "divide error: 0000 \\[#\\d+\\] SMP" + }, + { + "type": "temporary", + "reason": "MemoryReadError", + "pattern": "CE memory read error .*" + }, + { + "type": "permanent", + "condition": "KernelDeadlock", + "reason": "AUFSUmountHung", + "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "permanent", + "condition": "KernelDeadlock", + "reason": "DockerHung", + "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\." + }, + { + "type": "permanent", + "condition": "ReadonlyFilesystem", + "reason": "FilesystemIsReadOnly", + "pattern": "Remounting filesystem read-only" + } + ] + } + docker-monitor.json: | + { + "plugin": "journald", + "pluginConfig": { + "source": "dockerd" + }, + "logPath": "/var/log/journal", + "lookback": "5m", + "bufferSize": 10, + "source": "docker-monitor", + "conditions": [], + "rules": [ + { + "type": "temporary", + "reason": "CorruptDockerImage", + "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*" + } + ] + } +{{ end }} diff --git a/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go b/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go index e55dea7fcbe2e..c1323d7ad78c3 100644 --- a/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go +++ b/upup/pkg/fi/cloudup/bootstrapchannelbuilder/bootstrapchannelbuilder.go @@ -563,6 +563,27 @@ func (b *BootstrapChannelBuilder) buildAddons(c *fi.ModelBuilderContext) (*chann } } + npd := b.Cluster.Spec.NodeProblemDetector + + if npd != nil && fi.BoolValue(npd.Enabled) { + + key := "node-problem-detector.addons.k8s.io" + version := "0.8.7" + + { + location := key + "/k8s-1.19.yaml" + 
id := "k8s-1.19" + + addons.Spec.Addons = append(addons.Spec.Addons, &channelsapi.AddonSpec{ + Name: fi.String(key), + Version: fi.String(version), + Selector: map[string]string{"k8s-addon": key}, + Manifest: fi.String(location), + Id: id, + }) + } + } + if b.Cluster.Spec.AWSLoadBalancerController != nil && fi.BoolValue(b.Cluster.Spec.AWSLoadBalancerController.Enabled) { key := "aws-load-balancer-controller.addons.k8s.io" diff --git a/upup/pkg/fi/cloudup/populate_cluster_spec.go b/upup/pkg/fi/cloudup/populate_cluster_spec.go index 1c91a57d9558a..fc6b85433bb6c 100644 --- a/upup/pkg/fi/cloudup/populate_cluster_spec.go +++ b/upup/pkg/fi/cloudup/populate_cluster_spec.go @@ -280,6 +280,7 @@ func (c *populateClusterSpec) run(clientset simple.Clientset) error { codeModels = append(codeModels, &components.DiscoveryOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.ClusterAutoscalerOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.NodeTerminationHandlerOptionsBuilder{OptionsContext: optionsContext}) + codeModels = append(codeModels, &components.NodeProblemDetectorOptionsBuilder{OptionsContext: optionsContext}) codeModels = append(codeModels, &components.AWSEBSCSIDriverOptionsBuilder{OptionsContext: optionsContext}) } }