From 53fc8fb29f5069407594baa92877ee57411fc9cc Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 12 Jul 2018 17:49:22 +0200 Subject: [PATCH 1/6] Refactor long line --- cmd/controller/main.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 56388d4d5..e5d22a226 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -202,7 +202,14 @@ func main() { machineInformerFactory.Start(stopCh) kubeSystemInformerFactory.Start(stopCh) - for _, syncsMap := range []map[reflect.Type]bool{kubeInformerFactory.WaitForCacheSync(stopCh), kubePublicKubeInformerFactory.WaitForCacheSync(stopCh), machineInformerFactory.WaitForCacheSync(stopCh), defaultKubeInformerFactory.WaitForCacheSync(stopCh), kubeSystemInformerFactory.WaitForCacheSync(stopCh)} { + syncsMaps := []map[reflect.Type]bool{ + kubeInformerFactory.WaitForCacheSync(stopCh), + kubePublicKubeInformerFactory.WaitForCacheSync(stopCh), + machineInformerFactory.WaitForCacheSync(stopCh), + defaultKubeInformerFactory.WaitForCacheSync(stopCh), + kubeSystemInformerFactory.WaitForCacheSync(stopCh), + } + for _, syncsMap := range syncsMaps { for key, synced := range syncsMap { if !synced { glog.Fatalf("unable to sync %s", key) From f61e251659005db3831f6bdac43a94b67e246ab3 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 12 Jul 2018 18:22:18 +0200 Subject: [PATCH 2/6] Remove old stateful metrics --- pkg/controller/machine.go | 53 ++------------------------------------- pkg/controller/metrics.go | 44 ++++++-------------------------- 2 files changed, 9 insertions(+), 88 deletions(-) diff --git a/pkg/controller/machine.go b/pkg/controller/machine.go index 8dae31fe0..34568a5b5 100644 --- a/pkg/controller/machine.go +++ b/pkg/controller/machine.go @@ -102,12 +102,8 @@ type KubeconfigProvider interface { // MetricsCollection is a struct of all metrics used in // this controller. type MetricsCollection struct { - Machines prometheus.Gauge - Nodes prometheus.Gauge - Workers prometheus.Gauge - Errors prometheus.Counter - ControllerOperation *prometheus.HistogramVec - NodeJoinDuration *prometheus.HistogramVec + Workers prometheus.Gauge + Errors prometheus.Counter } // NewMachineController returns a new machine controller @@ -129,11 +125,7 @@ func NewMachineController( eventBroadcaster.StartLogging(glog.V(4).Infof) eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) - prometheus.MustRegister(metrics.ControllerOperation) prometheus.MustRegister(metrics.Errors) - prometheus.MustRegister(metrics.Machines) - prometheus.MustRegister(metrics.Nodes) - prometheus.MustRegister(metrics.NodeJoinDuration) prometheus.MustRegister(metrics.Workers) controller := &Controller{ @@ -192,7 +184,6 @@ func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error { } c.metrics.Workers.Set(float64(threadiness)) - go wait.Until(c.updateMetrics, metricsUpdatePeriod, stopCh) <-stopCh return nil @@ -280,26 +271,18 @@ func (c *Controller) updateMachineErrorIfTerminalError(machine *machinev1alpha1. } func (c *Controller) getProviderInstance(prov cloud.Provider, machine *machinev1alpha1.Machine) (instance.Instance, error) { - start := time.Now() - defer c.metrics.ControllerOperation.With(prometheus.Labels{"operation": "get-cloud-instance"}).Observe(time.Since(start).Seconds()) return prov.Get(machine) } func (c *Controller) deleteProviderInstance(prov cloud.Provider, machine *machinev1alpha1.Machine, instance instance.Instance) error { - start := time.Now() - defer c.metrics.ControllerOperation.With(prometheus.Labels{"operation": "delete-cloud-instance"}).Observe(time.Since(start).Seconds()) return prov.Delete(machine, instance) } func (c *Controller) createProviderInstance(prov cloud.Provider, machine *machinev1alpha1.Machine, userdata string) (instance.Instance, error) { - start := time.Now() - defer c.metrics.ControllerOperation.With(prometheus.Labels{"operation": "create-cloud-instance"}).Observe(time.Since(start).Seconds()) return prov.Create(machine, userdata) } func (c *Controller) validateMachine(prov cloud.Provider, machine *machinev1alpha1.Machine) error { - start := time.Now() - defer c.metrics.ControllerOperation.With(prometheus.Labels{"operation": "validate-machine"}).Observe(time.Since(start).Seconds()) return prov.Validate(machine.Spec) } @@ -580,7 +563,6 @@ func (c *Controller) ensureNodeOwnerRefAndConfigSource(providerInstance instance } glog.V(4).Infof("Added owner ref to node %s (machine=%s)", node.Name, machine.Name) c.recorder.Eventf(machine, corev1.EventTypeNormal, "NodeMatched", "Successfully matched machine to node %s", node.Name) - c.metrics.NodeJoinDuration.WithLabelValues().Observe(node.CreationTimestamp.Sub(machine.CreationTimestamp.Time).Seconds()) } if node.Spec.ConfigSource == nil && machine.Spec.ConfigSource != nil { @@ -878,37 +860,6 @@ func (c *Controller) ReadinessChecks() map[string]healthcheck.Check { } } -func (c *Controller) updateMachinesMetric() { - machines, err := c.machinesLister.List(labels.Everything()) - if err != nil { - glog.Errorf("failed to list machines for machines metric: %v", err) - return - } - c.metrics.Machines.Set(float64(len(machines))) -} - -func (c *Controller) updateNodesMetric() { - nodes, err := c.nodesLister.List(labels.Everything()) - if err != nil { - glog.Errorf("failed to list nodes for machine nodes metric: %v", err) - return - } - - machineNodes := 0 - for _, n := range nodes { - ownerRef := metav1.GetControllerOf(n) - if ownerRef != nil && ownerRef.Kind == machineKind { - machineNodes++ - } - } - c.metrics.Nodes.Set(float64(machineNodes)) -} - -func (c *Controller) updateMetrics() { - c.updateMachinesMetric() - c.updateNodesMetric() -} - func (c *Controller) ensureDeleteFinalizerExists(machine *machinev1alpha1.Machine) (*machinev1alpha1.Machine, error) { if !sets.NewString(machine.Finalizers...).Has(finalizerDeleteInstance) { finalizers := sets.NewString(machine.Finalizers...) diff --git a/pkg/controller/metrics.go b/pkg/controller/metrics.go index a78f8a109..aff2f9bd1 100644 --- a/pkg/controller/metrics.go +++ b/pkg/controller/metrics.go @@ -4,55 +4,25 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +const metricsPrefix = "machine_controller_" + // NewMachineControllerMetrics creates new MachineControllerMetrics // with default values initialized, so metrics always show up. func NewMachineControllerMetrics() *MetricsCollection { - namespace := "machine" - subsystem := "controller" - cm := &MetricsCollection{ - Machines: prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "machines", - Help: "The number of machines", - }), Workers: prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "workers", - Help: "The number of running machine controller workers", - }), - Nodes: prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "nodes", - Help: "The number of nodes created by a machine", + Name: metricsPrefix + "workers", + Help: "The number of running machine controller workers", }), Errors: prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "errors_total", - Help: "The total number or unexpected errors the controller encountered", + Name: metricsPrefix + "errors_total", + Help: "The total number or unexpected errors the controller encountered", }), - ControllerOperation: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "controller_operation_duration_seconds", - Help: "The duration it takes to execute an operation", - }, []string{"operation"}), - NodeJoinDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: subsystem, - Name: "node_join_duration_seconds", - Help: "The time it takes from creation of the machine resource and the final creation of the node resource", - }, []string{}), } // Set default values, so that these metrics always show up - cm.Machines.Set(0) cm.Workers.Set(0) - cm.Nodes.Set(0) + cm.Errors.Add(0) return cm } From a1f005455743212c825a5f2a9937013e0de5a6e7 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 12 Jul 2018 18:42:54 +0200 Subject: [PATCH 3/6] Create Machine and Node collector --- cmd/controller/main.go | 9 +++ pkg/controller/metrics.go | 135 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) diff --git a/cmd/controller/main.go b/cmd/controller/main.go index e5d22a226..2544c7a59 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -54,6 +54,7 @@ import ( "github.com/kubermatic/machine-controller/pkg/machines" "github.com/kubermatic/machine-controller/pkg/signals" "github.com/oklog/run" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" ) @@ -220,6 +221,14 @@ func main() { ctx, ctxDone := context.WithCancel(context.Background()) var g run.Group { + prometheus.MustRegister(controller.NewMachineCollector( + machineInformerFactory.Machine().V1alpha1().Machines().Lister(), + )) + + prometheus.MustRegister(controller.NewNodeController( + kubeInformerFactory.Core().V1().Nodes().Lister(), + )) + s := createUtilHttpServer(kubeClient, kubeconfigProvider) g.Add(func() error { return s.ListenAndServe() diff --git a/pkg/controller/metrics.go b/pkg/controller/metrics.go index aff2f9bd1..e372e67cc 100644 --- a/pkg/controller/metrics.go +++ b/pkg/controller/metrics.go @@ -1,7 +1,10 @@ package controller import ( + "github.com/kubermatic/machine-controller/pkg/client/listers/machines/v1alpha1" "github.com/prometheus/client_golang/prometheus" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/listers/core/v1" ) const metricsPrefix = "machine_controller_" @@ -26,3 +29,135 @@ func NewMachineControllerMetrics() *MetricsCollection { return cm } + +type MachineCollector struct { + lister v1alpha1.MachineLister + + machines *prometheus.Desc + machineCreated *prometheus.Desc + machineDeleted *prometheus.Desc +} + +func NewMachineCollector(lister v1alpha1.MachineLister) *MachineCollector { + return &MachineCollector{ + lister: lister, + + machines: prometheus.NewDesc( + metricsPrefix+"machines", + "The number of machines managed by this machine controller", + nil, nil, + ), + machineCreated: prometheus.NewDesc( + metricsPrefix+"machine_created", + "Timestamp of the machine's creation time", + []string{"machine"}, nil, + ), + machineDeleted: prometheus.NewDesc( + metricsPrefix+"machine_deleted", + "Timestamp of the machine's deletion time", + []string{"machine"}, nil, + ), + } +} + +func (mc MachineCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- mc.machines + ch <- mc.machineCreated + ch <- mc.machineDeleted +} + +func (mc MachineCollector) Collect(ch chan<- prometheus.Metric) { + machines, err := mc.lister.List(labels.Everything()) + if err != nil { + return + } + + ch <- prometheus.MustNewConstMetric( + mc.machines, + prometheus.GaugeValue, + float64(len(machines)), + ) + + for _, machine := range machines { + ch <- prometheus.MustNewConstMetric( + mc.machineCreated, + prometheus.GaugeValue, + float64(machine.CreationTimestamp.Unix()), + machine.Name, + ) + + if machine.DeletionTimestamp != nil { + ch <- prometheus.MustNewConstMetric( + mc.machineDeleted, + prometheus.GaugeValue, + float64(machine.DeletionTimestamp.Unix()), + machine.Name, + ) + } + } +} + +type NodeController struct { + lister v1.NodeLister + + nodes *prometheus.Desc + nodeCreated *prometheus.Desc + nodeDeleted *prometheus.Desc +} + +func NewNodeController(lister v1.NodeLister) *NodeController { + return &NodeController{ + lister: lister, + + nodes: prometheus.NewDesc( + metricsPrefix+"nodes", + "The number of nodes created by a machine", + nil, nil, + ), + nodeCreated: prometheus.NewDesc( + metricsPrefix+"node_created", + "The number of nodes created by a machine", + []string{"node"}, nil, + ), + nodeDeleted: prometheus.NewDesc( + metricsPrefix+"node_deleted", + "The number of nodes created by a machine", + []string{"node"}, nil, + ), + } +} + +func (nc *NodeController) Describe(ch chan<- *prometheus.Desc) { + ch <- nc.nodes +} + +func (nc *NodeController) Collect(ch chan<- prometheus.Metric) { + nodes, err := nc.lister.List(labels.Everything()) + if err != nil { + return + } + + ch <- prometheus.MustNewConstMetric( + nc.nodes, + prometheus.GaugeValue, + float64(len(nodes)), + ) + + for _, node := range nodes { + ch <- prometheus.MustNewConstMetric( + nc.nodeCreated, + prometheus.GaugeValue, + float64(node.CreationTimestamp.Unix()), + node.Name, + ) + + if node.DeletionTimestamp != nil { + ch <- prometheus.MustNewConstMetric( + nc.nodeDeleted, + prometheus.GaugeValue, + float64(node.DeletionTimestamp.Unix()), + node.Name, + ) + } + } +} From 81f6fd2e1df30777d5cad6270e8fe34b2220349c Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 12 Jul 2018 19:03:41 +0200 Subject: [PATCH 4/6] Add example alert for machine being unable to delete --- examples/alerts.yaml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/alerts.yaml b/examples/alerts.yaml index 4525363a1..c37179ecb 100644 --- a/examples/alerts.yaml +++ b/examples/alerts.yaml @@ -7,13 +7,18 @@ groups: labels: severity: critical annotations: - description: "Machine Controller in namespace {{ $labels.namespace }} is down for more than 5 minutes." - summary: "Machine Controller is down" + message: "Machine Controller in namespace {{ $labels.namespace }} is down for more than 5 minutes." - alert: MachineControllerTooManyErrors expr: sum(rate(machine_controller_errors_total[5m])) by (namespace) > 0.01 for: 10m labels: severity: warning annotations: - description: "Machine Controller in {{ $labels.namespace }} has too many errors in its loop." - summary: "Machine Controller has many errors" + message: "Machine Controller in {{ $labels.namespace }} has too many errors in its loop." + - alert: MachineControllerDeleting + expr: machine_controller_machine_deleted > 0 + for: 10m + labels: + severity: critical + annotations: + message: "Unable to delete machine {{ $labels.machine }}" From 116c92888f42f938d2bebc5733325704915788d0 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 13 Jul 2018 14:37:37 +0200 Subject: [PATCH 5/6] Fix typo and rename to NodeCollector --- cmd/controller/main.go | 2 +- pkg/controller/metrics.go | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 2544c7a59..e1ba8a680 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -225,7 +225,7 @@ func main() { machineInformerFactory.Machine().V1alpha1().Machines().Lister(), )) - prometheus.MustRegister(controller.NewNodeController( + prometheus.MustRegister(controller.NewNodeCollector( kubeInformerFactory.Core().V1().Nodes().Lister(), )) diff --git a/pkg/controller/metrics.go b/pkg/controller/metrics.go index e372e67cc..848d71345 100644 --- a/pkg/controller/metrics.go +++ b/pkg/controller/metrics.go @@ -97,7 +97,7 @@ func (mc MachineCollector) Collect(ch chan<- prometheus.Metric) { } } -type NodeController struct { +type NodeCollector struct { lister v1.NodeLister nodes *prometheus.Desc @@ -105,8 +105,8 @@ type NodeController struct { nodeDeleted *prometheus.Desc } -func NewNodeController(lister v1.NodeLister) *NodeController { - return &NodeController{ +func NewNodeCollector(lister v1.NodeLister) *NodeCollector { + return &NodeCollector{ lister: lister, nodes: prometheus.NewDesc( @@ -127,11 +127,11 @@ func NewNodeController(lister v1.NodeLister) *NodeController { } } -func (nc *NodeController) Describe(ch chan<- *prometheus.Desc) { +func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) { ch <- nc.nodes } -func (nc *NodeController) Collect(ch chan<- prometheus.Metric) { +func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) { nodes, err := nc.lister.List(labels.Everything()) if err != nil { return From 08e8f1787101229707bd40de0a4f572d8ec37393 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 13 Jul 2018 16:52:45 +0200 Subject: [PATCH 6/6] Remove NodeCollector as it's a kube-state-metrics dup --- cmd/controller/main.go | 4 --- pkg/controller/metrics.go | 66 --------------------------------------- 2 files changed, 70 deletions(-) diff --git a/cmd/controller/main.go b/cmd/controller/main.go index e1ba8a680..3db9f61f5 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -225,10 +225,6 @@ func main() { machineInformerFactory.Machine().V1alpha1().Machines().Lister(), )) - prometheus.MustRegister(controller.NewNodeCollector( - kubeInformerFactory.Core().V1().Nodes().Lister(), - )) - s := createUtilHttpServer(kubeClient, kubeconfigProvider) g.Add(func() error { return s.ListenAndServe() diff --git a/pkg/controller/metrics.go b/pkg/controller/metrics.go index 848d71345..0b4c4acfb 100644 --- a/pkg/controller/metrics.go +++ b/pkg/controller/metrics.go @@ -4,7 +4,6 @@ import ( "github.com/kubermatic/machine-controller/pkg/client/listers/machines/v1alpha1" "github.com/prometheus/client_golang/prometheus" "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/listers/core/v1" ) const metricsPrefix = "machine_controller_" @@ -96,68 +95,3 @@ func (mc MachineCollector) Collect(ch chan<- prometheus.Metric) { } } } - -type NodeCollector struct { - lister v1.NodeLister - - nodes *prometheus.Desc - nodeCreated *prometheus.Desc - nodeDeleted *prometheus.Desc -} - -func NewNodeCollector(lister v1.NodeLister) *NodeCollector { - return &NodeCollector{ - lister: lister, - - nodes: prometheus.NewDesc( - metricsPrefix+"nodes", - "The number of nodes created by a machine", - nil, nil, - ), - nodeCreated: prometheus.NewDesc( - metricsPrefix+"node_created", - "The number of nodes created by a machine", - []string{"node"}, nil, - ), - nodeDeleted: prometheus.NewDesc( - metricsPrefix+"node_deleted", - "The number of nodes created by a machine", - []string{"node"}, nil, - ), - } -} - -func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- nc.nodes -} - -func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) { - nodes, err := nc.lister.List(labels.Everything()) - if err != nil { - return - } - - ch <- prometheus.MustNewConstMetric( - nc.nodes, - prometheus.GaugeValue, - float64(len(nodes)), - ) - - for _, node := range nodes { - ch <- prometheus.MustNewConstMetric( - nc.nodeCreated, - prometheus.GaugeValue, - float64(node.CreationTimestamp.Unix()), - node.Name, - ) - - if node.DeletionTimestamp != nil { - ch <- prometheus.MustNewConstMetric( - nc.nodeDeleted, - prometheus.GaugeValue, - float64(node.DeletionTimestamp.Unix()), - node.Name, - ) - } - } -}