Skip to content

Commit

Permalink
add dynamic config metrics
Browse files Browse the repository at this point in the history
This PR exports config-related metrics from the Kubelet.
The Gauges for active, assigned, and last-known-good config can be used
to identify config versions and produce aggregate counts across several
nodes. The error-reporting Gauge can be used to determine whether a node
is experiencing a config-related error, and to produce an aggregate
count of nodes in an error state.
  • Loading branch information
mtaufen committed May 22, 2018
1 parent b5648c3 commit 1faff12
Show file tree
Hide file tree
Showing 4 changed files with 292 additions and 3 deletions.
19 changes: 19 additions & 0 deletions pkg/kubelet/kubeletconfig/status/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"k8s.io/apimachinery/pkg/types"
clientset "k8s.io/client-go/kubernetes"
utillog "k8s.io/kubernetes/pkg/kubelet/kubeletconfig/util/log"
"k8s.io/kubernetes/pkg/kubelet/metrics"
nodeutil "k8s.io/kubernetes/pkg/util/node"
)

Expand Down Expand Up @@ -176,6 +177,24 @@ func (s *nodeConfigStatus) Sync(client clientset.Interface, nodeName string) {
status.Error = s.errorOverride
}

// update metrics based on the status we will sync
metrics.SetConfigError(len(status.Error) > 0)
err = metrics.SetAssignedConfig(status.Assigned)
if err != nil {
err = fmt.Errorf("failed to update Assigned config metric, error: %v", err)
return
}
err = metrics.SetActiveConfig(status.Active)
if err != nil {
err = fmt.Errorf("failed to update Active config metric, error: %v", err)
return
}
err = metrics.SetLastKnownGoodConfig(status.LastKnownGood)
if err != nil {
err = fmt.Errorf("failed to update LastKnownGood config metric, error: %v", err)
return
}

// apply the status to a copy of the node so we don't modify the object in the informer's store
newNode := oldNode.DeepCopy()
newNode.Status.Config = status
Expand Down
140 changes: 140 additions & 0 deletions pkg/kubelet/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,15 @@ limitations under the License.
package metrics

import (
"fmt"
"sync"
"time"

"github.com/golang/glog"
"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/features"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)

Expand All @@ -47,6 +51,17 @@ const (
// Metrics keys of device plugin operations
DevicePluginRegistrationCountKey = "device_plugin_registration_count"
DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds"

// Metric keys for node config (these mirror the Assigned, Active, and LastKnownGood sources reported in Node.Status.Config)
AssignedConfigKey = "node_config_assigned"
ActiveConfigKey = "node_config_active"
LastKnownGoodConfigKey = "node_config_last_known_good"
ConfigErrorKey = "node_config_error"
ConfigNameLabelKey = "node_config_name" // this is a fully resolved name, e.g. an API path or a URL, "Name" is short but a little confusing name for this key...
ConfigNameLabelValueLocal = "local"
ConfigUIDLabelKey = "node_config_uid"
ConfigResourceVersionLabelKey = "node_config_resource_version"
KubeletConfigKeyLabelKey = "node_config_kubelet_key"
)

var (
Expand Down Expand Up @@ -150,6 +165,40 @@ var (
},
[]string{"resource_name"},
)

// Metrics for node config

AssignedConfig = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: AssignedConfigKey,
Help: "The node's understanding of intended config. The count is always 1.",
},
[]string{ConfigNameLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey},
)
ActiveConfig = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: ActiveConfigKey,
Help: "The config source the node is actively using. The count is always 1.",
},
[]string{ConfigNameLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey},
)
LastKnownGoodConfig = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: LastKnownGoodConfigKey,
Help: "The config source the node will fall back to when it encounters certain errors. The count is always 1.",
},
[]string{ConfigNameLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey},
)
ConfigError = prometheus.NewGauge(
prometheus.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: ConfigErrorKey,
Help: "This metric is true (1) if the node is experiencing a configuration-related error, false (0) otherwise.",
},
)
)

var registerMetrics sync.Once
Expand All @@ -172,6 +221,12 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu
prometheus.MustRegister(EvictionStatsAge)
prometheus.MustRegister(DevicePluginRegistrationCount)
prometheus.MustRegister(DevicePluginAllocationLatency)
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) {
prometheus.MustRegister(AssignedConfig)
prometheus.MustRegister(ActiveConfig)
prometheus.MustRegister(LastKnownGoodConfig)
prometheus.MustRegister(ConfigError)
}
for _, collector := range collectors {
prometheus.MustRegister(collector)
}
Expand Down Expand Up @@ -232,3 +287,88 @@ func (pc *podAndContainerCollector) Collect(ch chan<- prometheus.Metric) {
prometheus.GaugeValue,
float64(runningContainers))
}

// configMapAPIPathFmt is the format used to render a ConfigMap reference as an API path;
// the verbs are filled with the namespace and then the name.
const configMapAPIPathFmt = "/api/v1/namespaces/%s/configmaps/%s"

// configLabels computes the label set for a node config metric from source.
// A nil source is reported under the name "local" with every other label empty.
// A source with a ConfigMap reference is reported under that ConfigMap's API path,
// UID, resource version, and Kubelet config key.
// An error is returned for a non-nil source whose ConfigMap subfield is nil.
func configLabels(source *corev1.NodeConfigSource) (map[string]string, error) {
	switch {
	case source == nil:
		// prometheus requires all of the labels that can be set on the metric
		return map[string]string{
			ConfigNameLabelKey:            "local",
			ConfigUIDLabelKey:             "",
			ConfigResourceVersionLabelKey: "",
			KubeletConfigKeyLabelKey:      "",
		}, nil
	case source.ConfigMap != nil:
		ref := source.ConfigMap
		return map[string]string{
			ConfigNameLabelKey:            fmt.Sprintf(configMapAPIPathFmt, ref.Namespace, ref.Name),
			ConfigUIDLabelKey:             string(ref.UID),
			ConfigResourceVersionLabelKey: ref.ResourceVersion,
			KubeletConfigKeyLabelKey:      ref.KubeletConfigKey,
		}, nil
	default:
		return nil, fmt.Errorf("unrecognized config source type, all source subfields were nil")
	}
}

// assignedConfigLabels remembers the label set of the assigned-config timeseries exposed by the
// most recent SetAssignedConfig call, so that it can be deleted on the next update and not leak.
var assignedConfigLabels = map[string]string{}

// SetAssignedConfig replaces the assigned-config timeseries with one labeled according to source.
// If labels cannot be computed from source, the error is returned and the metric is left as it was.
func SetAssignedConfig(source *corev1.NodeConfigSource) error {
	next, err := configLabels(source)
	if err != nil {
		return err
	}
	// drop the timeseries for the previous label set (With creates a new one for each distinct label set)
	AssignedConfig.Delete(assignedConfigLabels)
	// remember the new label set, then expose its timeseries with a constant count of 1
	assignedConfigLabels = next
	AssignedConfig.With(next).Set(1)
	return nil
}

// activeConfigLabels remembers the label set of the active-config timeseries exposed by the
// most recent SetActiveConfig call, so that it can be deleted on the next update and not leak.
var activeConfigLabels = map[string]string{}

// SetActiveConfig replaces the active-config timeseries with one labeled according to source.
// If labels cannot be computed from source, the error is returned and the metric is left as it was.
func SetActiveConfig(source *corev1.NodeConfigSource) error {
	next, err := configLabels(source)
	if err != nil {
		return err
	}
	// drop the timeseries for the previous label set (With creates a new one for each distinct label set)
	ActiveConfig.Delete(activeConfigLabels)
	// remember the new label set, then expose its timeseries with a constant count of 1
	activeConfigLabels = next
	ActiveConfig.With(next).Set(1)
	return nil
}

// lastKnownGoodConfigLabels remembers the label set of the last-known-good timeseries exposed by the
// most recent SetLastKnownGoodConfig call, so that it can be deleted on the next update and not leak.
var lastKnownGoodConfigLabels = map[string]string{}

// SetLastKnownGoodConfig replaces the last-known-good timeseries with one labeled according to source.
// If labels cannot be computed from source, the error is returned and the metric is left as it was.
func SetLastKnownGoodConfig(source *corev1.NodeConfigSource) error {
	next, err := configLabels(source)
	if err != nil {
		return err
	}
	// drop the timeseries for the previous label set (With creates a new one for each distinct label set)
	LastKnownGoodConfig.Delete(lastKnownGoodConfigLabels)
	// remember the new label set, then expose its timeseries with a constant count of 1
	lastKnownGoodConfigLabels = next
	LastKnownGoodConfig.With(next).Set(1)
	return nil
}

// SetConfigError sets the ConfigError gauge to 1 when err is true and to 0 when it is false.
func SetConfigError(err bool) {
	value := 0.0
	if err {
		value = 1
	}
	ConfigError.Set(value)
}
117 changes: 114 additions & 3 deletions test/e2e_node/dynamic_kubelet_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package e2e_node

import (
"fmt"
"reflect"
"strings"
"time"

Expand All @@ -27,12 +28,17 @@ import (
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
controller "k8s.io/kubernetes/pkg/kubelet/kubeletconfig"
"k8s.io/kubernetes/pkg/kubelet/kubeletconfig/status"
"k8s.io/kubernetes/pkg/kubelet/metrics"
frameworkMetrics "k8s.io/kubernetes/test/e2e/framework/metrics"

"k8s.io/kubernetes/test/e2e/framework"

"github.com/prometheus/common/model"

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
Expand All @@ -45,8 +51,6 @@ type expectNodeConfigStatus struct {
// If true, expect Status.Config.Active == Status.Config.LastKnownGood,
// otherwise expect Status.Config.Active == Status.Config.Assigned.
lkgActive bool
// If true, skip checking Status.Config.LastKnownGood == this.lastKnownGood in the status.
skipLkg bool
}

type nodeConfigTestCase struct {
Expand Down Expand Up @@ -809,6 +813,8 @@ func (tc *nodeConfigTestCase) run(f *framework.Framework, fn func(f *framework.F
tc.checkNodeConfigSource(f)
// check status
tc.checkConfigStatus(f)
// check that the Kubelet's config-related metrics are correct
tc.checkConfigMetrics(f)
// check expectConfig
if tc.expectConfig != nil {
tc.checkConfig(f)
Expand Down Expand Up @@ -929,7 +935,7 @@ func expectConfigStatus(tc *nodeConfigTestCase, actual *apiv1.NodeConfigStatus)
errs = append(errs, spew.Sprintf("expected Assigned %#v but got %#v", expectAssigned, actual.Assigned))
}
// check LastKnownGood matches tc.expectConfigStatus.lastKnownGood
if !tc.expectConfigStatus.skipLkg && !apiequality.Semantic.DeepEqual(tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood) {
if !apiequality.Semantic.DeepEqual(tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood) {
errs = append(errs, spew.Sprintf("expected LastKnownGood %#v but got %#v", tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood))
}
// check Active matches Assigned or LastKnownGood, depending on tc.expectConfigStatus.lkgActive
Expand Down Expand Up @@ -1016,6 +1022,111 @@ func (tc *nodeConfigTestCase) checkEvent(f *framework.Framework) {
}, timeout, interval).Should(BeNil())
}

// checkConfigMetrics makes sure the Kubelet's config related metrics are as we expect, given the test case.
// It builds the expected assigned, active, last-known-good, and error samples from the test case, then
// polls the Kubelet's metrics every second for up to a minute until they match exactly.
func (tc *nodeConfigTestCase) checkConfigMetrics(f *framework.Framework) {
	const (
		timeout  = time.Minute
		interval = time.Second
		// subsystem-prefixed names of the metrics under test
		assignedConfigKey      = metrics.KubeletSubsystem + "_" + metrics.AssignedConfigKey
		activeConfigKey        = metrics.KubeletSubsystem + "_" + metrics.ActiveConfigKey
		lastKnownGoodConfigKey = metrics.KubeletSubsystem + "_" + metrics.LastKnownGoodConfigKey
		configErrorKey         = metrics.KubeletSubsystem + "_" + metrics.ConfigErrorKey
	)

	// configSamples returns the single sample expected for the config metric called name.
	// The labels describe local config unless source references a ConfigMap, in which case
	// they are derived from that reference. The value is always 1.
	configSamples := func(name model.LabelValue, source *apiv1.NodeConfigSource) model.Samples {
		labels := map[model.LabelName]model.LabelValue{
			model.MetricNameLabel:                 name,
			metrics.ConfigNameLabelKey:            "local",
			metrics.ConfigUIDLabelKey:             "",
			metrics.ConfigResourceVersionLabelKey: "",
			metrics.KubeletConfigKeyLabelKey:      "",
		}
		if source != nil && source.ConfigMap != nil {
			ref := source.ConfigMap
			labels[metrics.ConfigNameLabelKey] = model.LabelValue(fmt.Sprintf("/api/v1/namespaces/%s/configmaps/%s", ref.Namespace, ref.Name))
			labels[metrics.ConfigUIDLabelKey] = model.LabelValue(ref.UID)
			labels[metrics.ConfigResourceVersionLabelKey] = model.LabelValue(ref.ResourceVersion)
			labels[metrics.KubeletConfigKeyLabelKey] = model.LabelValue(ref.KubeletConfigKey)
		}
		return model.Samples{&model.Sample{
			Metric: model.Metric(labels),
			Value:  1,
		}}
	}

	// assigned: the test case's config source, with UID and ResourceVersion taken from the test case's ConfigMap
	assignedSource := tc.configSource.DeepCopy()
	if assignedSource != nil && assignedSource.ConfigMap != nil {
		assignedSource.ConfigMap.UID = tc.configMap.UID
		assignedSource.ConfigMap.ResourceVersion = tc.configMap.ResourceVersion
	}

	// last-known-good: taken directly from the expected status
	lastKnownGoodSource := tc.expectConfigStatus.lastKnownGood

	// active: mirrors last-known-good when lkgActive is set, otherwise mirrors assigned
	activeSource := assignedSource
	if tc.expectConfigStatus.lkgActive {
		activeSource = lastKnownGoodSource
	}

	// error: 1 when the test case expects an error message in the status, 0 otherwise
	errorValue := model.SampleValue(0)
	if len(tc.expectConfigStatus.err) > 0 {
		errorValue = model.SampleValue(1)
	}
	errorSamples := model.Samples{&model.Sample{
		Metric: model.Metric(map[model.LabelName]model.LabelValue{model.MetricNameLabel: configErrorKey}),
		Value:  errorValue,
	}}

	// expected metrics
	expect := frameworkMetrics.KubeletMetrics(map[string]model.Samples{
		assignedConfigKey:      configSamples(assignedConfigKey, assignedSource),
		activeConfigKey:        configSamples(activeConfigKey, activeSource),
		lastKnownGoodConfigKey: configSamples(lastKnownGoodConfigKey, lastKnownGoodSource),
		configErrorKey:         errorSamples,
	})

	// wait for expected metrics to appear
	Eventually(func() error {
		actual, err := getKubeletMetrics(sets.NewString(
			assignedConfigKey,
			activeConfigKey,
			lastKnownGoodConfigKey,
			configErrorKey,
		))
		if err != nil {
			return err
		}
		// zero out the timestamps in actual, so DeepEqual is time-invariant
		for key := range actual {
			for i := range actual[key] {
				actual[key][i].Timestamp = 0
			}
		}
		// compare to expected
		if reflect.DeepEqual(expect, actual) {
			return nil
		}
		return fmt.Errorf("checkConfigMetrics: case: %s: expect metrics %s but got %s", tc.desc, spew.Sprintf("%#v", expect), spew.Sprintf("%#v", actual))
	}, timeout, interval).Should(BeNil())
}

// constructs the expected SelfLink for a config map
func configMapAPIPath(cm *apiv1.ConfigMap) string {
return fmt.Sprintf("/api/v1/namespaces/%s/configmaps/%s", cm.Namespace, cm.Name)
Expand Down
Loading

0 comments on commit 1faff12

Please sign in to comment.