Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ junit*.xml
debug.test
/output/
coverage.out
.idea/
.idea
vendor
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,23 @@ For example, to run without auth, use the following config:

* `--custom-plugin-monitors`: List of paths to custom plugin monitor config files, comma-separated. This option is deprecated, replaced by `--config.custom-plugin-monitor`, and will be removed. NPD will panic if both `--custom-plugin-monitors` and `--config.custom-plugin-monitor` are set.

### Tainting Nodes
You can enable node tainting in response to permanent node problems. For example, in [config/kernel-monitor.json](config/kernel-monitor.json),
add a `taintConfig` object, as shown below, to each `Condition` that should taint the node. You can omit `taintConfig` entirely or disable it by setting `enabled` to `false`. Tainting is disabled by default and stays off until you explicitly enable it.
```json
{
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only",
"taintConfig": {
"enabled": false,
"key": "node-problem-detector/read-only-filesystem",
"value": "true",
"effect": "NoSchedule"
}
}
```

## Build Image

* Install development dependencies for `libsystemd` and the ARM GCC toolchain
Expand Down
16 changes: 14 additions & 2 deletions config/kernel-monitor.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,24 @@
{
"type": "KernelDeadlock",
"reason": "KernelHasNoDeadlock",
"message": "kernel has no deadlock"
"message": "kernel has no deadlock",
"taintConfig": {
"enabled": false,
"key": "node-problem-detector/kernel-deadlock",
"value": "true",
"effect": "NoSchedule"
}
},
{
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only"
"message": "Filesystem is not read-only",
"taintConfig": {
"enabled": false,
"key": "node-problem-detector/read-only-filesystem",
"value": "true",
"effect": "NoSchedule"
}
}
],
"rules": [
Expand Down
22 changes: 20 additions & 2 deletions deployment/node-problem-detector-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,24 @@ data:
{
"type": "KernelDeadlock",
"reason": "KernelHasNoDeadlock",
"message": "kernel has no deadlock"
"message": "kernel has no deadlock",
"taintConfig": {
"enabled": false,
"key": "node-problem-detector/kernel-deadlock",
"value": "true",
"effect": "NoSchedule"
}
},
{
"type": "ReadonlyFilesystem",
"reason": "FilesystemIsNotReadOnly",
"message": "Filesystem is not read-only"
"message": "Filesystem is not read-only",
"taintConfig": {
"enabled": true,
"key": "node-problem-detector/read-only-filesystem",
"value": "true",
"effect": "NoSchedule"
}
}
],
"rules": [
Expand Down Expand Up @@ -50,6 +62,12 @@ data:
"reason": "MemoryReadError",
"pattern": "CE memory read error .*"
},
{
"type": "permanent",
"condition": "KernelDeadlock",
"reason": "AUFSUmountHung",
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
},
{
"type": "permanent",
"condition": "KernelDeadlock",
Expand Down
52 changes: 52 additions & 0 deletions pkg/exporters/k8sexporter/condition/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package condition

import (
"context"
"fmt"
"reflect"
"sync"
"time"
Expand Down Expand Up @@ -159,7 +160,58 @@ func (c *conditionManager) sync(ctx context.Context) {
conditions := []v1.NodeCondition{}
for i := range c.conditions {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not 100% sure if the node tainting should occur in this sync or in another place, I'd recommend to get a +1 here from someone with more expertise.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i thought that best place is here but i can move it of course!

conditions = append(conditions, problemutil.ConvertToAPICondition(c.conditions[i]))

condition := c.conditions[i]
if condition.TaintConfig == nil || !condition.TaintConfig.Enabled {
// we are skipping tainting since TaintConfig of our condition is nil or disabled
continue
}

taintStr := fmt.Sprintf("%s=%s:%s", c.conditions[i].TaintConfig.Key, c.conditions[i].TaintConfig.Value,
c.conditions[i].TaintConfig.Effect)

node, err := c.client.GetNode(ctx)
if err != nil {
glog.Errorf("failed to get node: %v", err)
continue
}

taintExists := problemclient.CheckIfTaintAlreadyExists(node, *condition.TaintConfig)

switch condition.Status {
case types.True:
if taintExists {
// we are skipping here since node is already tainted with our TaintConfig
continue
}

glog.Infof("for condition %s, tainting is enabled and status is True, tainting with %s",
condition.Type, taintStr)

if err := c.client.TaintNode(node, condition); err != nil {
glog.Errorf("failed to add taint %v: %v", taintStr, err)
continue
}

glog.Infof("successfully tainted node with %s", taintStr)
case types.False:
if !taintExists {
// we are skipping here since node is not tainted with our TaintConfig
continue
}

glog.Infof("for condition %s, tainting is enabled and condition status is False, removing taint %s",
condition.Type, taintStr)

if err := c.client.UntaintNode(node, condition); err != nil {
glog.Errorf("failed to remove taint %v: %v", taintStr, err)
continue
}

glog.Infof("successfully removed taint %s from node", taintStr)
}
}

if err := c.client.SetConditions(ctx, conditions); err != nil {
// The conditions will be updated again in future sync
glog.Errorf("failed to update node conditions: %v", err)
Expand Down
215 changes: 215 additions & 0 deletions pkg/exporters/k8sexporter/condition/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package condition
import (
"context"
"fmt"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"testing"
"time"

Expand Down Expand Up @@ -126,6 +127,220 @@ func TestResync(t *testing.T) {
assert.True(t, m.needResync(), "Should resync after resync period and resync is needed")
}

func TestSync(t *testing.T) {
cases := []struct {
caseName string
condition types.Condition
node *v1.Node
injectError bool
errorKey string
}{
{"Sync success with Status True and nil taint config",
types.Condition{
Type: "ReadonlyFilesystem",
Status: "True",
}, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "my-node",
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
}, false, "TaintNode",
},
{"Sync success with Status True and disabled taint config",
types.Condition{
Type: "ReadonlyFilesystem",
Status: "True",
TaintConfig: &types.TaintConfig{
Enabled: false,
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: string(v1.TaintEffectNoSchedule),
},
}, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "my-node",
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
}, false, "",
},
{"Sync failure with Status True and TaintNode error",
types.Condition{
Type: "ReadonlyFilesystem",
Status: "True",
TaintConfig: &types.TaintConfig{
Enabled: true,
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: string(v1.TaintEffectNoSchedule),
},
}, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "my-node",
},
Spec: v1.NodeSpec{},
}, true, "TaintNode"},
{"Sync failure with Status True and GetNode error",
types.Condition{
Type: "ReadonlyFilesystem",
Status: "True",
TaintConfig: &types.TaintConfig{
Enabled: true,
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: string(v1.TaintEffectNoSchedule),
},
}, nil, true, "GetNode"},
{"Sync success with Status True and non-nil and enabled taint config",
types.Condition{
Type: "ReadonlyFilesystem",
Status: "True",
TaintConfig: &types.TaintConfig{
Enabled: true,
Key: "node-problem-detector/read-only",
Value: "true",
Effect: string(v1.TaintEffectNoSchedule),
},
}, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "my-node",
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
}, false, "",
},
{"Sync success with Status True and already tainted node",
types.Condition{
Type: "ReadonlyFilesystem",
Status: "True",
TaintConfig: &types.TaintConfig{
Enabled: true,
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: string(v1.TaintEffectNoSchedule),
},
}, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "my-node",
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
}, false, "",
},
{"Sync success with Status False",
types.Condition{
Type: "ReadonlyFilesystem",
Status: "False",
TaintConfig: &types.TaintConfig{
Enabled: true,
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: string(v1.TaintEffectNoSchedule),
},
}, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "my-node",
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
}, false, "",
},
{"Sync success with Status False and taint not exists",
types.Condition{
Type: "ReadonlyFilesystem",
Status: "False",
TaintConfig: &types.TaintConfig{
Enabled: true,
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: string(v1.TaintEffectNoSchedule),
},
}, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "my-node",
},
Spec: v1.NodeSpec{},
}, false, "",
},
{"Sync failure with Status False",
types.Condition{
Type: "ReadonlyFilesystem",
Status: "False",
TaintConfig: &types.TaintConfig{
Enabled: true,
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: string(v1.TaintEffectNoSchedule),
},
}, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "my-node",
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: "node-problem-detector/read-only-filesystem",
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
}, true, "UntaintNode",
},
}

for _, tc := range cases {
m, fakeClient, _ := newTestManager()
m.conditions = map[string]types.Condition{tc.condition.Type: tc.condition}

if tc.node != nil {
fakeClient.InjectNode("mynode", tc.node)
}

if tc.injectError {
fakeClient.InjectError(tc.errorKey, fmt.Errorf("injected error"))
}

m.sync(context.Background())
}
}

func TestHeartbeat(t *testing.T) {
m, fakeClient, fakeClock := newTestManager()
condition := newTestCondition("TestCondition")
Expand Down
Loading