Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ import (
"github.com/kubermatic/machine-controller/pkg/machines"
"github.com/kubermatic/machine-controller/pkg/signals"
"github.com/oklog/run"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

Expand Down Expand Up @@ -202,7 +203,14 @@ func main() {
machineInformerFactory.Start(stopCh)
kubeSystemInformerFactory.Start(stopCh)

for _, syncsMap := range []map[reflect.Type]bool{kubeInformerFactory.WaitForCacheSync(stopCh), kubePublicKubeInformerFactory.WaitForCacheSync(stopCh), machineInformerFactory.WaitForCacheSync(stopCh), defaultKubeInformerFactory.WaitForCacheSync(stopCh), kubeSystemInformerFactory.WaitForCacheSync(stopCh)} {
syncsMaps := []map[reflect.Type]bool{
kubeInformerFactory.WaitForCacheSync(stopCh),
kubePublicKubeInformerFactory.WaitForCacheSync(stopCh),
machineInformerFactory.WaitForCacheSync(stopCh),
defaultKubeInformerFactory.WaitForCacheSync(stopCh),
kubeSystemInformerFactory.WaitForCacheSync(stopCh),
}
for _, syncsMap := range syncsMaps {
for key, synced := range syncsMap {
if !synced {
glog.Fatalf("unable to sync %s", key)
Expand All @@ -213,6 +221,10 @@ func main() {
ctx, ctxDone := context.WithCancel(context.Background())
var g run.Group
{
prometheus.MustRegister(controller.NewMachineCollector(
machineInformerFactory.Machine().V1alpha1().Machines().Lister(),
))

s := createUtilHttpServer(kubeClient, kubeconfigProvider)
g.Add(func() error {
return s.ListenAndServe()
Expand Down
13 changes: 9 additions & 4 deletions examples/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,18 @@ groups:
labels:
severity: critical
annotations:
description: "Machine Controller in namespace {{ $labels.namespace }} is down for more than 5 minutes."
summary: "Machine Controller is down"
message: "Machine Controller in namespace {{ $labels.namespace }} is down for more than 5 minutes."
- alert: MachineControllerTooManyErrors
expr: sum(rate(machine_controller_errors_total[5m])) by (namespace) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: "Machine Controller in {{ $labels.namespace }} has too many errors in its loop."
summary: "Machine Controller has many errors"
message: "Machine Controller in {{ $labels.namespace }} has too many errors in its loop."
- alert: MachineControllerDeleting
expr: machine_controller_machine_deleted > 0
for: 10m
labels:
severity: critical
annotations:
message: "Unable to delete machine {{ $labels.machine }}"
53 changes: 2 additions & 51 deletions pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,8 @@ type KubeconfigProvider interface {
// MetricsCollection is a struct of all metrics used in
// this controller.
type MetricsCollection struct {
Machines prometheus.Gauge
Nodes prometheus.Gauge
Workers prometheus.Gauge
Errors prometheus.Counter
ControllerOperation *prometheus.HistogramVec
NodeJoinDuration *prometheus.HistogramVec
Workers prometheus.Gauge
Errors prometheus.Counter
}

// NewMachineController returns a new machine controller
Expand All @@ -129,11 +125,7 @@ func NewMachineController(
eventBroadcaster.StartLogging(glog.V(4).Infof)
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})

prometheus.MustRegister(metrics.ControllerOperation)
prometheus.MustRegister(metrics.Errors)
prometheus.MustRegister(metrics.Machines)
prometheus.MustRegister(metrics.Nodes)
prometheus.MustRegister(metrics.NodeJoinDuration)
prometheus.MustRegister(metrics.Workers)

controller := &Controller{
Expand Down Expand Up @@ -192,7 +184,6 @@ func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error {
}

c.metrics.Workers.Set(float64(threadiness))
go wait.Until(c.updateMetrics, metricsUpdatePeriod, stopCh)

<-stopCh
return nil
Expand Down Expand Up @@ -280,26 +271,18 @@ func (c *Controller) updateMachineErrorIfTerminalError(machine *machinev1alpha1.
}

// getProviderInstance calls prov.Get for the given machine and records the
// duration of that call in the ControllerOperation histogram under the
// "get-cloud-instance" operation label.
//
// It returns whatever prov.Get returns, unmodified.
func (c *Controller) getProviderInstance(prov cloud.Provider, machine *machinev1alpha1.Machine) (instance.Instance, error) {
	start := time.Now()
	// The observation must happen inside a closure: arguments of a deferred
	// call are evaluated when the defer statement executes, so deferring
	// Observe(time.Since(start)...) directly would always record ~0 seconds.
	defer func() {
		c.metrics.ControllerOperation.With(prometheus.Labels{"operation": "get-cloud-instance"}).Observe(time.Since(start).Seconds())
	}()
	return prov.Get(machine)
}

// deleteProviderInstance calls prov.Delete for the given machine and instance
// and records the duration of that call in the ControllerOperation histogram
// under the "delete-cloud-instance" operation label.
//
// It returns the error from prov.Delete, unmodified.
func (c *Controller) deleteProviderInstance(prov cloud.Provider, machine *machinev1alpha1.Machine, instance instance.Instance) error {
	start := time.Now()
	// The observation must happen inside a closure: arguments of a deferred
	// call are evaluated when the defer statement executes, so deferring
	// Observe(time.Since(start)...) directly would always record ~0 seconds.
	defer func() {
		c.metrics.ControllerOperation.With(prometheus.Labels{"operation": "delete-cloud-instance"}).Observe(time.Since(start).Seconds())
	}()
	return prov.Delete(machine, instance)
}

// createProviderInstance calls prov.Create with the given machine and userdata
// and records the duration of that call in the ControllerOperation histogram
// under the "create-cloud-instance" operation label.
//
// It returns whatever prov.Create returns, unmodified.
func (c *Controller) createProviderInstance(prov cloud.Provider, machine *machinev1alpha1.Machine, userdata string) (instance.Instance, error) {
	start := time.Now()
	// The observation must happen inside a closure: arguments of a deferred
	// call are evaluated when the defer statement executes, so deferring
	// Observe(time.Since(start)...) directly would always record ~0 seconds.
	defer func() {
		c.metrics.ControllerOperation.With(prometheus.Labels{"operation": "create-cloud-instance"}).Observe(time.Since(start).Seconds())
	}()
	return prov.Create(machine, userdata)
}

// validateMachine calls prov.Validate on the machine's spec and records the
// duration of that call in the ControllerOperation histogram under the
// "validate-machine" operation label.
//
// It returns the error from prov.Validate, unmodified.
func (c *Controller) validateMachine(prov cloud.Provider, machine *machinev1alpha1.Machine) error {
	start := time.Now()
	// The observation must happen inside a closure: arguments of a deferred
	// call are evaluated when the defer statement executes, so deferring
	// Observe(time.Since(start)...) directly would always record ~0 seconds.
	defer func() {
		c.metrics.ControllerOperation.With(prometheus.Labels{"operation": "validate-machine"}).Observe(time.Since(start).Seconds())
	}()
	return prov.Validate(machine.Spec)
}

Expand Down Expand Up @@ -580,7 +563,6 @@ func (c *Controller) ensureNodeOwnerRefAndConfigSource(providerInstance instance
}
glog.V(4).Infof("Added owner ref to node %s (machine=%s)", node.Name, machine.Name)
c.recorder.Eventf(machine, corev1.EventTypeNormal, "NodeMatched", "Successfully matched machine to node %s", node.Name)
c.metrics.NodeJoinDuration.WithLabelValues().Observe(node.CreationTimestamp.Sub(machine.CreationTimestamp.Time).Seconds())
}

if node.Spec.ConfigSource == nil && machine.Spec.ConfigSource != nil {
Expand Down Expand Up @@ -878,37 +860,6 @@ func (c *Controller) ReadinessChecks() map[string]healthcheck.Check {
}
}

// updateMachinesMetric sets the Machines gauge to the number of machines
// returned by the machines lister. If listing fails, the error is logged and
// the gauge is left untouched.
func (c *Controller) updateMachinesMetric() {
	machineList, listErr := c.machinesLister.List(labels.Everything())
	if listErr != nil {
		glog.Errorf("failed to list machines for machines metric: %v", listErr)
		return
	}

	total := len(machineList)
	c.metrics.Machines.Set(float64(total))
}

// updateNodesMetric sets the Nodes gauge to the number of listed nodes whose
// controller owner reference is of kind machineKind. If listing fails, the
// error is logged and the gauge is left untouched.
func (c *Controller) updateNodesMetric() {
	nodeList, listErr := c.nodesLister.List(labels.Everything())
	if listErr != nil {
		glog.Errorf("failed to list nodes for machine nodes metric: %v", listErr)
		return
	}

	var owned int
	for i := range nodeList {
		controllerRef := metav1.GetControllerOf(nodeList[i])
		// Skip nodes without a controller, or controlled by something
		// other than a machine.
		if controllerRef == nil || controllerRef.Kind != machineKind {
			continue
		}
		owned++
	}

	c.metrics.Nodes.Set(float64(owned))
}

// updateMetrics refreshes the lister-derived gauges: first the machines
// gauge, then the nodes gauge.
func (c *Controller) updateMetrics() {
	refreshers := [...]func(){
		c.updateMachinesMetric,
		c.updateNodesMetric,
	}
	for _, refresh := range refreshers {
		refresh()
	}
}

func (c *Controller) ensureDeleteFinalizerExists(machine *machinev1alpha1.Machine) (*machinev1alpha1.Machine, error) {
if !sets.NewString(machine.Finalizers...).Has(finalizerDeleteInstance) {
finalizers := sets.NewString(machine.Finalizers...)
Expand Down
113 changes: 76 additions & 37 deletions pkg/controller/metrics.go
Original file line number Diff line number Diff line change
@@ -1,58 +1,97 @@
package controller

import (
"github.com/kubermatic/machine-controller/pkg/client/listers/machines/v1alpha1"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/labels"
)

const metricsPrefix = "machine_controller_"

// NewMachineControllerMetrics creates a new MetricsCollection
// with default values initialized, so metrics always show up.
func NewMachineControllerMetrics() *MetricsCollection {
namespace := "machine"
subsystem := "controller"

cm := &MetricsCollection{
Machines: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "machines",
Help: "The number of machines",
}),
Workers: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "workers",
Help: "The number of running machine controller workers",
}),
Nodes: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "nodes",
Help: "The number of nodes created by a machine",
Name: metricsPrefix + "workers",
Help: "The number of running machine controller workers",
}),
Errors: prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "errors_total",
Help: "The total number or unexpected errors the controller encountered",
Name: metricsPrefix + "errors_total",
Help: "The total number or unexpected errors the controller encountered",
}),
ControllerOperation: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "controller_operation_duration_seconds",
Help: "The duration it takes to execute an operation",
}, []string{"operation"}),
NodeJoinDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "node_join_duration_seconds",
Help: "The time it takes from creation of the machine resource and the final creation of the node resource",
}, []string{}),
}

// Set default values, so that these metrics always show up
cm.Machines.Set(0)
cm.Workers.Set(0)
cm.Nodes.Set(0)
cm.Errors.Add(0)

return cm
}

// MachineCollector is a Prometheus collector whose metrics are computed from
// the machines returned by a MachineLister each time it is collected.
type MachineCollector struct {
	// lister is the source of the machines the metrics are derived from.
	lister v1alpha1.MachineLister

	// Descriptors of the emitted metrics: the total machine count, and the
	// per-machine creation and deletion timestamps.
	machines, machineCreated, machineDeleted *prometheus.Desc
}

// NewMachineCollector returns a MachineCollector that reads machines from the
// given lister. It prepares the descriptors for the machine count metric and
// the per-machine creation/deletion timestamp metrics; the latter two carry a
// single "machine" label.
func NewMachineCollector(lister v1alpha1.MachineLister) *MachineCollector {
	const machineLabel = "machine"

	collector := new(MachineCollector)
	collector.lister = lister

	collector.machines = prometheus.NewDesc(
		metricsPrefix+"machines",
		"The number of machines managed by this machine controller",
		nil, nil,
	)
	collector.machineCreated = prometheus.NewDesc(
		metricsPrefix+"machine_created",
		"Timestamp of the machine's creation time",
		[]string{machineLabel}, nil,
	)
	collector.machineDeleted = prometheus.NewDesc(
		metricsPrefix+"machine_deleted",
		"Timestamp of the machine's deletion time",
		[]string{machineLabel}, nil,
	)

	return collector
}

// Describe sends the descriptor of every metric this collector can emit to
// ch, in the order: machines, machineCreated, machineDeleted.
func (mc MachineCollector) Describe(ch chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		mc.machines,
		mc.machineCreated,
		mc.machineDeleted,
	}
	for _, desc := range descs {
		ch <- desc
	}
}

// Collect lists all machines from the lister and sends the resulting metrics
// to ch:
//
//   - one gauge with the total number of machines,
//   - per machine, a gauge holding its creation timestamp (Unix seconds),
//   - per machine that has a deletion timestamp set, a gauge holding that
//     timestamp (Unix seconds).
//
// Per-machine metrics are labelled with the machine's name. If listing the
// machines fails, a single invalid metric carrying the error is sent so the
// failure is reported to whoever gathers the metrics instead of the machine
// metrics silently disappearing.
func (mc MachineCollector) Collect(ch chan<- prometheus.Metric) {
	machines, err := mc.lister.List(labels.Everything())
	if err != nil {
		// Do not swallow the error: hand it to the gatherer.
		ch <- prometheus.NewInvalidMetric(mc.machines, err)
		return
	}

	ch <- prometheus.MustNewConstMetric(
		mc.machines,
		prometheus.GaugeValue,
		float64(len(machines)),
	)

	for _, machine := range machines {
		ch <- prometheus.MustNewConstMetric(
			mc.machineCreated,
			prometheus.GaugeValue,
			float64(machine.CreationTimestamp.Unix()),
			machine.Name,
		)

		// The deletion metric only exists for machines that are being deleted.
		if machine.DeletionTimestamp != nil {
			ch <- prometheus.MustNewConstMetric(
				mc.machineDeleted,
				prometheus.GaugeValue,
				float64(machine.DeletionTimestamp.Unix()),
				machine.Name,
			)
		}
	}
}