Skip to content
This repository has been archived by the owner on Nov 19, 2020. It is now read-only.

Commit

Permalink
Add Prometheus Metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
justenwalker committed Oct 15, 2018
1 parent 0eef161 commit 6bf6a9d
Show file tree
Hide file tree
Showing 112 changed files with 29,987 additions and 0 deletions.
7 changes: 7 additions & 0 deletions go.mod
Expand Up @@ -2,8 +2,15 @@ module github.com/jet/damon

require (
github.com/BurntSushi/toml v0.3.0 // indirect
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973 // indirect
github.com/golang/protobuf v1.2.0 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect
github.com/natefinch/lumberjack v2.0.0+incompatible
github.com/pkg/errors v0.8.0
github.com/prometheus/client_golang v0.8.0
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910 // indirect
github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e // indirect
github.com/prometheus/procfs v0.0.0-20180920065004-418d78d0b9a7 // indirect
github.com/rs/zerolog v1.9.1
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e
gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect
Expand Down
14 changes: 14 additions & 0 deletions go.sum
@@ -1,9 +1,23 @@
github.com/BurntSushi/toml v0.3.0 h1:e1/Ivsx3Z0FVTV0NSOv/aVgbUWyQuzj7DDnFblkRvsY=
github.com/BurntSushi/toml v0.3.0/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973 h1:xJ4a3vCFaGF/jqvzLMYoU8P317H5OQ+Via4RmuPwCS0=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/natefinch/lumberjack v2.0.0+incompatible h1:4QJd3OLAMgj7ph+yZTuX13Ld4UpgHp07nNdFX7mqFfM=
github.com/natefinch/lumberjack v2.0.0+incompatible/go.mod h1:Wi9p2TTF5DG5oU+6YfsmYQpsTIOm0B1VNzQg9Mw6nPk=
github.com/pkg/errors v0.8.0 h1:WdK/asTD0HN+q6hsWO3/vpuAkAr+tw6aNJNDFFf0+qw=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/prometheus/client_golang v0.8.0 h1:1921Yw9Gc3iSc4VQh3PIoOqgPCZS7G/4xQNVUp8Mda8=
github.com/prometheus/client_golang v0.8.0/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910 h1:idejC8f05m9MGOsuEi1ATq9shN03HrxNkD/luQvxCv8=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e h1:n/3MEhJQjQxrOUCzh1Y3Re6aJUUWRp2M9+Oc3eVn/54=
github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
github.com/prometheus/procfs v0.0.0-20180920065004-418d78d0b9a7 h1:NgR6WN8nQ4SmFC1sSUHY8SriLuWCZ6cCIQtH4vDZN3c=
github.com/prometheus/procfs v0.0.0-20180920065004-418d78d0b9a7/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/rs/zerolog v1.9.1 h1:AjV/SFRF0+gEa6rSjkh0Eji/DnkrJKVpPho6SW5g4mU=
github.com/rs/zerolog v1.9.1/go.mod h1:YbFCdg8HfsridGWAh22vktObvhZbQsZXe4/zB0OKkWU=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e h1:o3PsSEY8E4eXWkXrIP9YJALUkVZqzHJT5DOasTyn8Vs=
Expand Down
323 changes: 323 additions & 0 deletions metrics/metrics.go
@@ -0,0 +1,323 @@
package metrics

import (
"net/http"
"sync"
"time"

"github.com/jet/damon/container"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Metrics owns a private Prometheus registry together with the CPU, memory
// and IO metrics that are registered on it. Set the exported fields first,
// then call Init before using any other method.
type Metrics struct {
	// Namespace is applied as the namespace of every metric created by Init.
	Namespace string
	// Labels are attached to every metric as constant labels.
	Labels map[string]string
	// MHzPerCore is handed to the CPUCollector created by Init.
	MHzPerCore float64
	// Cores is handed to the CPUCollector created by Init.
	Cores int

	// cpuCollector converts cumulative CPU times into per-sample values.
	cpuCollector *CPUCollector
	// registry holds every metric declared below.
	registry *prometheus.Registry
	// handler is the HTTP handler built for registry.
	handler http.Handler

	// cpu: cumulative seconds spent in kernel and user mode.
	cpuKernelTime, cpuUserTime prometheus.Gauge
	// cpu: share of available CPU time used since the previous sample.
	cpuKernelPercent, cpuUserPercent prometheus.Gauge
	// cpu: the shares above expressed in Hz.
	cpuKernelHz, cpuUserHz prometheus.Gauge
	// cpu: number of CPU limit violations reported.
	cpuNotification prometheus.Counter

	// memory: working set size, commit charge and page fault count.
	memoryWorkingSet, memoryCommitCharge, memoryPageFaultCount prometheus.Gauge
	// memory: number of memory limit violations reported.
	memoryNotification prometheus.Counter

	// io: bytes transferred (total, read, write, other).
	ioTxTotalBytes, ioTxReadBytes, ioTxWriteBytes, ioTxOtherBytes prometheus.Gauge
	// io: operation counts (read, write, other, total).
	ioReadOpsTotal, ioWriteOpsTotal, ioOtherOpsTotal, ioTotalOperations prometheus.Gauge
	// io: number of IO limit violations reported.
	ioNotification prometheus.Counter
}

// Init creates the CPU collector, a fresh Prometheus registry, the HTTP
// handler serving that registry, and every CPU, memory and IO metric.
//
// Each metric is created with m.Namespace as its namespace and m.Labels as
// its constant labels, and is registered on the new registry. Registration
// uses MustRegister, so an invalid or duplicate metric definition panics.
// Init must be called before OnStats, OnViolation or Handler.
func (m *Metrics) Init() {
	m.cpuCollector = &CPUCollector{
		MHzPerCore: m.MHzPerCore,
		Cores:      m.Cores,
	}
	m.registry = prometheus.NewRegistry()
	m.handler = promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{})

	constLabels := prometheus.Labels(m.Labels)

	// gauge builds a gauge in the given subsystem and registers it.
	gauge := func(subsystem, name, help string) prometheus.Gauge {
		g := prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace:   m.Namespace,
			Subsystem:   subsystem,
			Name:        name,
			Help:        help,
			ConstLabels: constLabels,
		})
		m.registry.MustRegister(g)
		return g
	}
	// counter builds a counter in the given subsystem and registers it.
	counter := func(subsystem, name, help string) prometheus.Counter {
		c := prometheus.NewCounter(prometheus.CounterOpts{
			Namespace:   m.Namespace,
			Subsystem:   subsystem,
			Name:        name,
			Help:        help,
			ConstLabels: constLabels,
		})
		m.registry.MustRegister(c)
		return c
	}

	// cpu
	m.cpuKernelTime = gauge("cpu", "kernel_seconds",
		`The number of seconds the process spent in kernel-mode`)
	m.cpuUserTime = gauge("cpu", "user_seconds",
		`The number of seconds the process spent in user-mode`)
	m.cpuKernelPercent = gauge("cpu", "kernel_percent",
		`Percent of the total cpu time this process executed in kernel mode. This is calculated by measuring the total nanoseconds this process spent in kernel mode, and dividing it by the total available cpu time (cores * uptime)`)
	m.cpuUserPercent = gauge("cpu", "user_percent",
		`Percent of the total cpu time this process executed in user mode. This is calculated by measuring the total nanoseconds this process spent in user mode, and dividing it by the total available cpu time (cores * uptime)`)
	m.cpuKernelHz = gauge("cpu", "kernel_hz",
		`Kernel-mode time converted to Hz. This is calculated by taking the kernel percent and multiplying with the total available CPU hz (cores * hz per core)`)
	m.cpuUserHz = gauge("cpu", "user_hz",
		`User-mode time converted to Hz. This is calculated by taking the user percent and multiplying with the total available CPU hz (cores * hz per core)`)
	m.cpuNotification = counter("cpu", "notifications_total",
		`Total number of CPU limit exceeded notifications.`)

	// memory
	m.memoryWorkingSet = gauge("memory", "working_set_bytes",
		`The current working set size, in bytes`)
	m.memoryCommitCharge = gauge("memory", "commit_charge_bytes",
		`The Commit Charge value in bytes for this process. Commit Charge is the total amount of memory that the memory manager has committed for a running process.`)
	m.memoryPageFaultCount = gauge("memory", "page_fault_total",
		`The number of page faults.`)
	m.memoryNotification = counter("memory", "notifications_total",
		`Total number of Memory limit exceeded notifications.`)

	// io operations
	m.ioReadOpsTotal = gauge("io", "read_operations_total",
		`Total number of read IO operations.`)
	m.ioWriteOpsTotal = gauge("io", "write_operations_total",
		`Total number of write IO operations.`)
	m.ioOtherOpsTotal = gauge("io", "other_operations_total",
		`Total number of other IO operations.`)
	m.ioTotalOperations = gauge("io", "operations_total",
		`Total number of IO operations.`)

	// io bytes
	m.ioTxReadBytes = gauge("io", "read_bytes",
		`Total number of IO read bytes transferred.`)
	m.ioTxWriteBytes = gauge("io", "write_bytes",
		`Total number of IO write bytes transferred.`)
	m.ioTxOtherBytes = gauge("io", "other_bytes",
		`Total number of IO other bytes transferred.`)
	m.ioTxTotalBytes = gauge("io", "total_bytes",
		`Total number of IO bytes transferred.`)

	// io notifications
	m.ioNotification = counter("io", "notifications_total",
		`Total number of IO limit exceeded notifications.`)
}

// OnStats publishes one snapshot of process statistics to the gauges.
//
// The cumulative CPU times are first passed through the CPU collector, which
// yields the per-sample percent and Hz values; the memory and IO figures are
// copied to their gauges as reported.
func (m *Metrics) OnStats(stats container.ProcessStats) {
	cpu := stats.CPUStats
	mem := stats.MemoryStats
	io := stats.IOStats

	// Sampling updates the collector's "last seen" values, so it happens
	// exactly once per snapshot.
	sample := m.cpuCollector.Sample(CPUMeasurement{
		TotalTime:  cpu.TotalCPUTime,
		UserTime:   cpu.TotalUserTime,
		KernelTime: cpu.TotalKernelTime,
	})

	// cpu: cumulative seconds
	m.cpuUserTime.Set(cpu.TotalUserTime.Seconds())
	m.cpuKernelTime.Set(cpu.TotalKernelTime.Seconds())
	// cpu: values derived from the sample
	m.cpuKernelHz.Set(float64(sample.KernelHz))
	m.cpuKernelPercent.Set(sample.KernelPercent)
	m.cpuUserHz.Set(float64(sample.UserHz))
	m.cpuUserPercent.Set(sample.UserPercent)

	// memory
	m.memoryCommitCharge.Set(float64(mem.PrivateUsageBytes))
	m.memoryWorkingSet.Set(float64(mem.WorkingSetSizeBytes))
	m.memoryPageFaultCount.Set(float64(mem.PageFaultCount))

	// io: bytes transferred
	m.ioTxReadBytes.Set(float64(io.TotalTxReadBytes))
	m.ioTxWriteBytes.Set(float64(io.TotalTxWrittenBytes))
	m.ioTxOtherBytes.Set(float64(io.TotalTxOtherBytes))
	m.ioTxTotalBytes.Set(float64(io.TotalTxCountBytes))
	// io: operation counts
	m.ioReadOpsTotal.Set(float64(io.TotalReadIOOperations))
	m.ioWriteOpsTotal.Set(float64(io.TotalWriteIOOperations))
	m.ioOtherOpsTotal.Set(float64(io.TotalOtherIOOperations))
	m.ioTotalOperations.Set(float64(io.TotalIOOperations))
}

// OnViolation increments the notification counter that matches the type of
// the reported limit violation. Violation types other than IO, CPU and
// memory are ignored.
func (m *Metrics) OnViolation(v container.LimitViolation) {
	kind := v.Type
	if kind == container.IOLimitViolation {
		m.ioNotification.Inc()
		return
	}
	if kind == container.CPULimitViolation {
		m.cpuNotification.Inc()
		return
	}
	if kind == container.MemoryLimitViolation {
		m.memoryNotification.Inc()
	}
}

// Handler returns the HTTP handler created by Init for this instance's
// registry. It returns nil if Init has not been called.
func (m *Metrics) Handler() http.Handler {
	h := m.handler
	return h
}

// CPUCollector remembers the cumulative CPU times of the previous sample so
// that Sample can report the change between consecutive measurements.
type CPUCollector struct {
	// Cumulative total, user and kernel times seen by the most recent call
	// to Sample. Sample reads and updates them while holding lock.
	LastTotalDuration, LastUserDuration, LastKernelDuration time.Duration
	// Cores scales the total-time delta into available CPU time.
	Cores int
	// MHzPerCore is multiplied by Cores to obtain the total available MHz.
	MHzPerCore float64
	// lock guards the Last*Duration fields.
	lock sync.Mutex
}

// CPUMeasurement is one reading of cumulative CPU times, as passed to
// CPUCollector.Sample.
type CPUMeasurement struct {
	// Cumulative total, user-mode and kernel-mode times at the moment of
	// the reading.
	TotalTime, UserTime, KernelTime time.Duration
}

// CPUSample is the result of CPUCollector.Sample: the measurement that was
// supplied plus the values derived from it and the previous measurement.
type CPUSample struct {
	// Measurement is the input this sample was computed from.
	Measurement CPUMeasurement
	// Change in kernel, user and total time since the previous measurement.
	DeltaKernelTime, DeltaUserTime, DeltaTotalTime time.Duration
	// KernelPercent is the kernel-time delta divided by the available CPU
	// time (total-time delta * cores). It is a ratio, not scaled by 100.
	KernelPercent float64
	// KernelHz is KernelPercent applied to the total available Hz.
	KernelHz uint64
	// UserPercent is the user-time delta divided by the available CPU time
	// (total-time delta * cores). It is a ratio, not scaled by 100.
	UserPercent float64
	// UserHz is UserPercent applied to the total available Hz.
	UserHz uint64
}

// Sample records m as the latest measurement and returns the CPU usage
// observed since the previous call.
//
// The previous cumulative values are read and replaced while holding the
// collector's lock; the arithmetic is done on local copies afterwards.
//
// KernelPercent and UserPercent are the kernel/user time deltas divided by
// the available CPU time (total-time delta * Cores). They are ratios and are
// not scaled by 100. KernelHz and UserHz are those ratios multiplied by the
// total available Hz (MHzPerCore * Cores, converted from MHz).
//
// When no CPU time was available for the interval (the total time did not
// advance, or Cores is zero) the percent and Hz fields are left at zero
// instead of dividing by zero. Hz values that would be negative or NaN are
// reported as zero, because converting such a float to uint64 has no
// portable result.
func (c *CPUCollector) Sample(m CPUMeasurement) CPUSample {
	c.lock.Lock()
	t0 := c.LastTotalDuration
	k0 := c.LastKernelDuration
	u0 := c.LastUserDuration
	c.LastTotalDuration = m.TotalTime
	c.LastKernelDuration = m.KernelTime
	c.LastUserDuration = m.UserTime
	c.lock.Unlock()

	sample := CPUSample{
		Measurement:     m,
		DeltaTotalTime:  m.TotalTime - t0,
		DeltaKernelTime: m.KernelTime - k0,
		// The user delta is taken against the previous user time (u0),
		// not the previous kernel time.
		DeltaUserTime: m.UserTime - u0,
	}

	// total cpu time = total time * num cores
	ttime := sample.DeltaTotalTime * time.Duration(c.Cores)
	if ttime <= 0 {
		// Nothing to divide by; report zero usage rather than NaN/Inf.
		return sample
	}
	tmhz := c.MHzPerCore * float64(c.Cores)

	const mHzToHz = 1000000.0
	// toHz scales a usage ratio to Hz, clamping anything that is not a
	// positive number (negative or NaN) to zero.
	toHz := func(ratio float64) uint64 {
		if hz := ratio * mHzToHz * tmhz; hz > 0 {
			return uint64(hz)
		}
		return 0
	}

	sample.KernelPercent = float64(sample.DeltaKernelTime) / float64(ttime)
	sample.UserPercent = float64(sample.DeltaUserTime) / float64(ttime)
	sample.KernelHz = toHz(sample.KernelPercent)
	sample.UserHz = toHz(sample.UserPercent)
	return sample
}

0 comments on commit 6bf6a9d

Please sign in to comment.