Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collect more CPU/disk/memory metrics #410

Merged
merged 4 commits into from
Feb 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
44 changes: 44 additions & 0 deletions config/system-stats-monitor.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
{
"cpu": {
"metricsConfigs": {
"cpu/runnable_task_count": {
"displayName": "cpu/runnable_task_count"
},
"cpu/usage_time": {
"displayName": "cpu/usage_time"
}
}
},
"disk": {
"metricsConfigs": {
"disk/io_time": {
Expand All @@ -9,6 +19,21 @@
},
"disk/avg_queue_len": {
"displayName": "disk/avg_queue_len"
},
"disk/operation_count": {
"displayName": "disk/operation_count"
},
"disk/merged_operation_count": {
"displayName": "disk/merged_operation_count"
},
"disk/operation_bytes_count": {
"displayName": "disk/operation_bytes_count"
},
"disk/operation_time": {
"displayName": "disk/operation_time"
},
"disk/bytes_used": {
"displayName": "disk/bytes_used"
}
},
"includeRootBlk": true,
Expand All @@ -22,5 +47,24 @@
}
}
},
"memory": {
"metricsConfigs": {
"memory/bytes_used": {
"displayName": "memory/bytes_used"
},
"memory/anonymous_used": {
"displayName": "memory/anonymous_used"
},
"memory/page_cache_used": {
"displayName": "memory/page_cache_used"
},
"memory/unevictable_used": {
"displayName": "memory/unevictable_used"
},
"memory/dirty_used": {
"displayName": "memory/dirty_used"
}
}
},
"invokeInterval": "60s"
}
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ require (
github.com/pborman/uuid v1.2.0
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90
github.com/prometheus/common v0.4.1
github.com/shirou/gopsutil v2.18.12+incompatible
github.com/prometheus/procfs v0.0.8
github.com/shirou/gopsutil v2.19.12+incompatible
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 // indirect
github.com/sigma/go-inotify v0.0.0-20181102212354-c87b6cf5033d // indirect
github.com/spf13/pflag v1.0.3
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -297,13 +297,17 @@ github.com/prometheus/procfs v0.0.0-20190117184657-bf6a532e95b1/go.mod h1:c3At6R
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/procfs v0.0.4 h1:w8DjqFMJDjuVwdZBQoOozr4MVWOnwF7RcL/7uxBjY78=
github.com/prometheus/procfs v0.0.4/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDaekg4FpcdQ=
github.com/prometheus/procfs v0.0.8 h1:+fpWZdT24pJBiqJdAwYBjPSk+5YmQzYNPYzQsdzLkt8=
github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A=
github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.3.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
github.com/satori/go.uuid v0.0.0-20160713180306-0aa62d5ddceb/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
github.com/shirou/gopsutil v2.18.12+incompatible h1:1eaJvGomDnH74/5cF4CTmTbLHAriGFsTZppLXDX93OM=
github.com/shirou/gopsutil v2.18.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
github.com/shirou/gopsutil v2.19.12+incompatible h1:WRstheAymn1WOPesh+24+bZKFkqrdCR8JOc77v4xV3Q=
github.com/shirou/gopsutil v2.19.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 h1:udFKJ0aHUL60LboW/A+DfgoHVedieIzIXE8uylPue0U=
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc=
github.com/shurcooL/githubv4 v0.0.0-20180925043049-51d7b505e2e9/go.mod h1:hAF0iLZy4td2EX+/8Tw+4nodhlMrwN3HupfaXj3zkGo=
Expand Down Expand Up @@ -407,6 +411,8 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58 h1:8gQV6CLnAEikrhgkHFbMAEhagSSnXWGV915qUMm9mrU=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e h1:vcxGaoTs7kV8m5Np9uUNQin4BrLOthgV7252N8V+FwY=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
Expand Down
24 changes: 18 additions & 6 deletions pkg/exporters/stackdriver/stackdriver_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,24 @@ func init() {
const exporterName = "stackdriver"

// NPDMetricToSDMetric maps NPD metric IDs to metric type names under the
// "compute.googleapis.com/guest/" namespace.
// NOTE(review): presumably used when exporting NPD metrics to Stackdriver —
// confirm against the conversion function in this file.
var NPDMetricToSDMetric = map[metrics.MetricID]string{
metrics.HostUptimeID: "compute.googleapis.com/guest/system/uptime",
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time",
metrics.DiskWeightedIOID: "compute.googleapis.com/guest/disk/weighted_io_time",
metrics.CPURunnableTaskCountID: "compute.googleapis.com/guest/cpu/runnable_task_count",
metrics.CPUUsageTimeID: "compute.googleapis.com/guest/cpu/usage_time",
metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
metrics.DiskBytesUsedID: "compute.googleapis.com/guest/disk/bytes_used",
metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time",
metrics.DiskMergedOpsCountID: "compute.googleapis.com/guest/disk/merged_operation_count",
metrics.DiskOpsBytesID: "compute.googleapis.com/guest/disk/operation_bytes_count",
metrics.DiskOpsCountID: "compute.googleapis.com/guest/disk/operation_count",
metrics.DiskOpsTimeID: "compute.googleapis.com/guest/disk/operation_time",
metrics.DiskWeightedIOID: "compute.googleapis.com/guest/disk/weighted_io_time",
metrics.HostUptimeID: "compute.googleapis.com/guest/system/uptime",
metrics.MemoryAnonymousUsedID: "compute.googleapis.com/guest/memory/anonymous_used",
metrics.MemoryBytesUsedID: "compute.googleapis.com/guest/memory/bytes_used",
metrics.MemoryDirtyUsedID: "compute.googleapis.com/guest/memory/dirty_used",
metrics.MemoryPageCacheUsedID: "compute.googleapis.com/guest/memory/page_cache_used",
metrics.MemoryUnevictableUsedID: "compute.googleapis.com/guest/memory/unevictable_used",
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
}

func getMetricTypeConversionFunction(customMetricPrefix string) func(*view.View) string {
Expand Down
52 changes: 46 additions & 6 deletions pkg/systemstatsmonitor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,67 @@

Currently supported components are:

* cpu
* disk
* host
* memory

See example config file [here](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json).

By setting the `metricsConfigs` field and `displayName` field ([example](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json)), you can specify the list of metrics to be collected, and their display names on the Prometheus scraping endpoint.

## Detailed Configuration Options

### Global Configurations

Data collection period can be specified globally in the config file, see `invokeInterval` at the [example](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json).

### CPU

Below metrics are collected from `cpu` component:

* `cpu_runnable_task_count`: The average number of runnable tasks in the run-queue during the last minute. Collected from [`/proc/loadavg`][/proc doc].
* `cpu_usage_time`: CPU usage, in seconds. The [CPU state][/proc doc] for the corresponding usage is reported under the `state` metric label (e.g. `user`, `nice`, `system`...).

[/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html

### Disk

Below metrics are collected from `disk` component:

* `disk/io_time`: [# of milliseconds spent doing I/Os on this device](https://www.kernel.org/doc/Documentation/iostats.txt)
* `disk/weighted_io`: [# of milliseconds spent doing I/Os on this device](https://www.kernel.org/doc/Documentation/iostats.txt)
* `disk/avg_queue_len`: [average # of requests that was waiting in queue or being serviced during the last `invokeInterval`](https://www.xaprb.com/blog/2010/01/09/how-linux-iostat-computes-its-results/)
* `disk_io_time`: [# of milliseconds spent doing I/Os on this device][iostat doc]
* `disk_weighted_io`: [weighted # of milliseconds spent doing I/Os on this device][iostat doc]
* `disk_avg_queue_len`: [average # of requests that were waiting in queue or being serviced during the last `invokeInterval`](https://www.xaprb.com/blog/2010/01/09/how-linux-iostat-computes-its-results/)
* `disk_operation_count`: [# of reads/writes completed][iostat doc]
* `disk_merged_operation_count`: [# of reads/writes merged][iostat doc]
* `disk_operation_bytes_count`: # of Bytes used for reads/writes on this device
* `disk_operation_time`: [# of milliseconds spent reading/writing][iostat doc]
* `disk_bytes_used`: Disk usage in Bytes. The usage state is reported under the `state` metric label (e.g. `used`, `free`). Summing values of all states yields the disk size.

The name of the disk block device is reported in the `device_name` metric label (e.g. `sda`).

By setting the `metricsConfigs` field and `displayName` field ([example](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json)), you can specify the list of metrics to be collected, and their display names on the Prometheus scraping endpoint. The name of the disk block device will be reported in the `device` metrics label.
For the metrics that separate read/write operations, the IO direction is reported in the `direction` metric label (e.g. `read`, `write`).

And a few other options:
* `includeRootBlk`: When set to `true`, add all block devices that's [not a slave or holder device](http://man7.org/linux/man-pages/man8/lsblk.8.html) to the list of disks that System Stats Monitor collects metrics from. When set to `false`, do not modify the list of disks that System Stats Monitor collects metrics from.
* `includeRootBlk`: When set to `true`, add all block devices that are [not a slave or holder device][lsblk doc] to the list of disks that System Stats Monitor collects metrics from. When set to `false`, do not modify the list of disks that System Stats Monitor collects metrics from.
* `includeAllAttachedBlk`: When set to `true`, add all currently attached block devices to the list of disks that System Stats Monitor collects metrics from. When set to `false`, do not modify the list of disks that System Stats Monitor collects metrics from.
* `lsblkTimeout`: System Stats Monitor uses [`lsblk`](http://man7.org/linux/man-pages/man8/lsblk.8.html) to retrieve block devices information. This option sets the timeout for calling `lsblk` commands.
* `lsblkTimeout`: System Stats Monitor uses [`lsblk`][lsblk doc] to retrieve block devices information. This option sets the timeout for calling `lsblk` commands.

[iostat doc]: https://www.kernel.org/doc/Documentation/iostats.txt
[lsblk doc]: http://man7.org/linux/man-pages/man8/lsblk.8.html

### Host

Below metrics are collected from `host` component:

* `host_uptime`: The uptime of the operating system, in seconds. OS version and kernel version are reported under the `os_version` and `kernel_version` metric labels (e.g. `cos 73-11647.217.0`, `4.14.127+`).

### Memory

Below metrics are collected from `memory` component:

* `memory_bytes_used`: Memory usage by each memory state, in Bytes. The memory state is reported under the `state` metric label (e.g. `free`, `used`, `buffered`...). Summing values of all states yields the total memory of the node.
* `memory_anonymous_used`: Anonymous memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not swapped until needed. Summing values of all states yields the total anonymous memory used.
* `memory_page_cache_used`: Page cache memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not reclaimed until needed. Summing values of all states yields the total page cache memory used.
* `memory_unevictable_used`: [Unevictable memory][/proc doc] usage, in Bytes.
* `memory_dirty_used`: Dirty pages usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `dirty`, `writeback`). `dirty` means the memory is waiting to be written back to disk, and `writeback` means the memory is actively being written back to disk.
144 changes: 144 additions & 0 deletions pkg/systemstatsmonitor/cpu_collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/*
Copyright 2020 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package systemstatsmonitor

import (
"github.com/golang/glog"
"github.com/shirou/gopsutil/cpu"
"github.com/shirou/gopsutil/load"

ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
"k8s.io/node-problem-detector/pkg/util/metrics"
)

// clockTick is the ratio between 1 second and 1 USER_HZ (a clock tick).
//
// CLK_TCK is 100 on most architectures. If NPD ever runs on an architecture
// where it differs, we can work out a way to detect the clock tick on that
// architecture (this might require cross-compilation with a C library or
// parsing kernel ABIs). For now, it's not worth the complexity.
//
// See documentation at http://man7.org/linux/man-pages/man5/proc.5.html
const clockTick float64 = 100.0

// cpuCollector gathers CPU metrics: the runnable task count and the per-state
// CPU usage time.
type cpuCollector struct {
// mRunnableTaskCount receives the 1-minute load average; recordLoad skips
// collection when it is nil.
mRunnableTaskCount *metrics.Float64Metric
// mUsageTime receives per-state CPU usage deltas; recordUsage skips
// collection when it is nil.
mUsageTime *metrics.Float64Metric

// config is the CPU stats configuration the collector was created with.
config *ssmtypes.CPUStatsConfig

// lastUsageTime remembers, per CPU state, the scaled usage value seen at the
// previous collection so that only the difference is recorded next time.
lastUsageTime map[string]float64
}

// NewCPUCollectorOrDie builds a cpuCollector from cpuConfig, creating the
// runnable-task-count and usage-time metrics with the display names found in
// cpuConfig.MetricsConfigs. It terminates the process via glog.Fatalf if either
// metric cannot be initialized.
func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
	collector := &cpuCollector{config: cpuConfig}

	// Runnable task count: a last-value gauge without labels.
	runnableName := cpuConfig.MetricsConfigs[string(metrics.CPURunnableTaskCountID)].DisplayName
	runnableMetric, runnableErr := metrics.NewFloat64Metric(
		metrics.CPURunnableTaskCountID,
		runnableName,
		"The average number of runnable tasks in the run-queue during the last minute",
		"1",
		metrics.LastValue,
		[]string{})
	collector.mRunnableTaskCount = runnableMetric
	if runnableErr != nil {
		glog.Fatalf("Error initializing metric for %q: %v", metrics.CPURunnableTaskCountID, runnableErr)
	}

	// CPU usage time: a summed metric broken down by the state label.
	usageName := cpuConfig.MetricsConfigs[string(metrics.CPUUsageTimeID)].DisplayName
	usageMetric, usageErr := metrics.NewFloat64Metric(
		metrics.CPUUsageTimeID,
		usageName,
		"CPU usage, in seconds",
		"s",
		metrics.Sum,
		[]string{stateLabel})
	collector.mUsageTime = usageMetric
	if usageErr != nil {
		glog.Fatalf("Error initializing metric for %q: %v", metrics.CPUUsageTimeID, usageErr)
	}

	collector.lastUsageTime = make(map[string]float64)

	return collector
}

// recordLoad records the 1-minute load average (load.Avg().Load1) into the
// runnable-task-count metric. It does nothing when that metric is nil, and
// logs an error without recording when the load average cannot be read.
func (cc *cpuCollector) recordLoad() {
	metric := cc.mRunnableTaskCount
	if metric == nil {
		return
	}

	avg, err := load.Avg()
	if err != nil {
		glog.Errorf("Failed to retrieve average CPU load: %v", err)
		return
	}

	// This metric carries no labels.
	noLabels := map[string]string{}
	metric.Record(noLabels, avg.Load1)
}

// recordUsage records, for every CPU state, the usage accumulated since the
// previous call.
//
// The aggregated (all-CPU) timers are read via cpu.Times, scaled by clockTick,
// and the difference from the value stored in lastUsageTime is recorded under
// the matching state label; lastUsageTime is then updated. It does nothing when
// the usage-time metric is nil, and logs an error without recording when the
// timers cannot be read or come back empty.
//
// NOTE(review): the usage-time metric is declared with unit "s", while the
// values recorded here are multiplied by clockTick. Confirm against the
// gopsutil documentation which unit cpu.TimesStat reports and whether this
// scaling really yields seconds.
func (cc *cpuCollector) recordUsage() {
	if cc.mUsageTime == nil {
		return
	}

	// Set percpu=false to get aggregated usage from all CPUs.
	timersStats, err := cpu.Times(false)
	if err != nil {
		glog.Errorf("Failed to retrieve CPU timers stat: %v", err)
		return
	}
	// Guard the index below: an empty result must not panic the monitor.
	if len(timersStats) == 0 {
		glog.Errorf("Failed to retrieve CPU timers stat: no data returned")
		return
	}
	timersStat := timersStats[0]

	// A slice (not a map) keeps the recording order deterministic.
	usageByState := []struct {
		state string
		value float64
	}{
		{"user", timersStat.User},
		{"system", timersStat.System},
		{"idle", timersStat.Idle},
		{"nice", timersStat.Nice},
		{"iowait", timersStat.Iowait},
		{"irq", timersStat.Irq},
		{"softirq", timersStat.Softirq},
		{"steal", timersStat.Steal},
		{"guest", timersStat.Guest},
		{"guest_nice", timersStat.GuestNice},
	}

	for _, usage := range usageByState {
		current := clockTick * usage.value
		// The metric aggregates by sum, so only the delta since the last
		// collection is recorded.
		cc.mUsageTime.Record(map[string]string{stateLabel: usage.state}, current-cc.lastUsageTime[usage.state])
		cc.lastUsageTime[usage.state] = current
	}
}

// collect runs one collection cycle: the load average first, then the
// per-state CPU usage. Calling it on a nil collector is a no-op.
func (cc *cpuCollector) collect() {
	if cc != nil {
		cc.recordLoad()
		cc.recordUsage()
	}
}