diff --git a/cmd/virtual-kubelet/root/flag.go b/cmd/virtual-kubelet/root/flag.go index 8825319140..5ae3ba7570 100644 --- a/cmd/virtual-kubelet/root/flag.go +++ b/cmd/virtual-kubelet/root/flag.go @@ -72,6 +72,7 @@ func InstallFlags(flags *pflag.FlagSet, o *Opts) { flags.BoolVar(&o.EnableStorage, "enable-storage", false, "Enable the Liqo storage reflection") flags.StringVar(&o.VirtualStorageClassName, "virtual-storage-class-name", "liqo", "Name of the virtual storage class") flags.StringVar(&o.RemoteRealStorageClassName, "remote-real-storage-class-name", "", "Name of the real storage class to use for the actual volumes") + flags.BoolVar(&o.EnableMetrics, "enable-metrics", false, "Enable the metrics server") flags.StringVar(&o.HomeAPIServerHost, "home-api-server-host", "", "Home cluster API server HOST, this parameter is optional and required only to override the default values") flags.StringVar(&o.HomeAPIServerPort, "home-api-server-port", "", diff --git a/cmd/virtual-kubelet/root/opts.go b/cmd/virtual-kubelet/root/opts.go index 02671de900..09ad4cac5b 100644 --- a/cmd/virtual-kubelet/root/opts.go +++ b/cmd/virtual-kubelet/root/opts.go @@ -96,6 +96,7 @@ type Opts struct { EnableStorage bool VirtualStorageClassName string RemoteRealStorageClassName string + EnableMetrics bool HomeAPIServerHost string HomeAPIServerPort string diff --git a/cmd/virtual-kubelet/root/root.go b/cmd/virtual-kubelet/root/root.go index 14721cfd5d..9e8150d228 100644 --- a/cmd/virtual-kubelet/root/root.go +++ b/cmd/virtual-kubelet/root/root.go @@ -36,6 +36,7 @@ import ( "github.com/liqotech/liqo/pkg/utils" "github.com/liqotech/liqo/pkg/utils/restcfg" nodeprovider "github.com/liqotech/liqo/pkg/virtualKubelet/liqoNodeProvider" + metrics "github.com/liqotech/liqo/pkg/virtualKubelet/metrics" podprovider "github.com/liqotech/liqo/pkg/virtualKubelet/provider" ) @@ -110,6 +111,7 @@ func runRootCommand(ctx context.Context, c *Opts) error { EnableStorage: c.EnableStorage, VirtualStorageClassName: 
c.VirtualStorageClassName, RemoteRealStorageClassName: c.RemoteRealStorageClassName, + EnableMetrics: c.EnableMetrics, HomeAPIServerHost: c.HomeAPIServerHost, HomeAPIServerPort: c.HomeAPIServerPort, @@ -194,6 +196,10 @@ func runRootCommand(ctx context.Context, c *Opts) error { return errors.Wrap(err, "error while setting up HTTPS server") } + if c.EnableMetrics { + metrics.SetupMetricHandler() + } + go func() { if err := nodeRunner.Run(ctx); err != nil { klog.Error(err, "error in pod controller running") diff --git a/deployments/liqo/README.md b/deployments/liqo/README.md index ab1597c53b..367a35272f 100644 --- a/deployments/liqo/README.md +++ b/deployments/liqo/README.md @@ -134,6 +134,11 @@ | virtualKubelet.extra.labels | object | `{}` | virtual kubelet pod extra labels | | virtualKubelet.extra.resources | object | `{"limits":{},"requests":{}}` | virtual kubelet pod containers' resource requests and limits (https://kubernetes.io/docs/user-guide/compute-resources/) | | virtualKubelet.imageName | string | `"ghcr.io/liqotech/virtual-kubelet"` | virtual kubelet image repository | +| virtualKubelet.metrics.enabled | bool | `false` | expose metrics about virtual kubelet resources. | +| virtualKubelet.metrics.podMonitor.enabled | bool | `false` | | +| virtualKubelet.metrics.podMonitor.interval | string | `""` | | +| virtualKubelet.metrics.podMonitor.scrapeTimeout | string | `""` | | +| virtualKubelet.metrics.port | int | `9090` | port used to expose metrics. 
| | virtualKubelet.virtualNode.extra.annotations | object | `{}` | virtual node extra annotations | | virtualKubelet.virtualNode.extra.labels | object | `{}` | virtual node extra labels | | webhook.failurePolicy | string | `"Fail"` | the webhook failure policy, among Ignore and Fail | diff --git a/deployments/liqo/templates/liqo-controller-manager-deployment.yaml b/deployments/liqo/templates/liqo-controller-manager-deployment.yaml index c82b1201e5..eceb41fc0e 100644 --- a/deployments/liqo/templates/liqo-controller-manager-deployment.yaml +++ b/deployments/liqo/templates/liqo-controller-manager-deployment.yaml @@ -26,6 +26,9 @@ {{- $vkargs = append $vkargs "--certificate-type=aws" }} {{- end }} {{- end }} +{{- if not (or (has "--enable-metrics" $vkargs ) (has "--enable-metrics=true" $vkargs ) (has "--enable-metrics=false" $vkargs )) }} +{{- $vkargs = append $vkargs "--enable-metrics=true" }} +{{- end}} apiVersion: apps/v1 kind: Deployment diff --git a/deployments/liqo/templates/liqo-virtualkubelet-podmonitor.yaml b/deployments/liqo/templates/liqo-virtualkubelet-podmonitor.yaml new file mode 100644 index 0000000000..82640fabd5 --- /dev/null +++ b/deployments/liqo/templates/liqo-virtualkubelet-podmonitor.yaml @@ -0,0 +1,22 @@ +{{- $kubeletMetricsConfig := (merge (dict "name" "virtual-kubelet" "module" "virtual-kubelet") .) 
-}} +{{- if .Values.virtualKubelet.metrics.podMonitor.enabled }} + +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ include "liqo.prefixedName" $kubeletMetricsConfig }} + labels: + {{- include "liqo.labels" $kubeletMetricsConfig | nindent 4 }} +spec: + namespaceSelector: + any: true + selector: + matchLabels: + app.kubernetes.io/name: "virtual-kubelet" + app.kubernetes.io/component: "virtual-kubelet" + podMetricsEndpoints: + - port: metrics + interval: {{ .Values.virtualKubelet.metrics.podMonitor.interval }} + scrapeTimeout: {{ .Values.virtualKubelet.metrics.podMonitor.scrapeTimeout }} +{{- end }} + diff --git a/deployments/liqo/values.yaml b/deployments/liqo/values.yaml index 1a484cc87c..3d47f315e8 100644 --- a/deployments/liqo/values.yaml +++ b/deployments/liqo/values.yaml @@ -315,6 +315,20 @@ virtualKubelet: annotations: {} # -- virtual node extra labels labels: {} + metrics: + # -- expose metrics about virtual kubelet resources. + enabled: false + # -- port used to expose metrics. + port: 9090 + podMonitor: + # # -- create a prometheus podmonitor. + enabled: false + # # -- setup pod monitor requests interval. If empty, Prometheus uses the global scrape interval. + # # ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#endpoint + interval: "" + # # -- setup pod monitor scrape timeout. If empty, Prometheus uses the global scrape timeout. 
+ # # ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#endpoint + scrapeTimeout: "" uninstaller: pod: diff --git a/pkg/liqoctl/install/handler.go b/pkg/liqoctl/install/handler.go index fb1bb8f939..c30d97d220 100644 --- a/pkg/liqoctl/install/handler.go +++ b/pkg/liqoctl/install/handler.go @@ -335,6 +335,15 @@ func (o *Options) values() map[string]interface{} { }, }, + "virtualKubelet": map[string]interface{}{ + "metrics": map[string]interface{}{ + "enabled": o.EnableMetrics, + "podMonitor": map[string]interface{}{ + "enabled": o.EnableMetrics, + }, + }, + }, + "telemetry": map[string]interface{}{ "enable": !o.DisableTelemetry, }, diff --git a/pkg/virtualKubelet/metrics/doc.go b/pkg/virtualKubelet/metrics/doc.go new file mode 100644 index 0000000000..f5e789af06 --- /dev/null +++ b/pkg/virtualKubelet/metrics/doc.go @@ -0,0 +1,16 @@ +// Copyright 2019-2023 The Liqo Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package metrics provides a set of metrics for Virtual Kubelet component. +package metrics diff --git a/pkg/virtualKubelet/metrics/metrics.go b/pkg/virtualKubelet/metrics/metrics.go new file mode 100644 index 0000000000..82133bfa41 --- /dev/null +++ b/pkg/virtualKubelet/metrics/metrics.go @@ -0,0 +1,84 @@ +// Copyright 2019-2023 The Liqo Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package metrics
+
+import (
+	"net/http"
+	"os"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+	"k8s.io/klog/v2"
+)
+
+const (
+	// MetricsPort is the listen address (":<port>") of the metrics HTTP server.
+	MetricsPort = ":9090"
+)
+
+var (
+	// ErrorsCounter is the counter of the errors occurred during the reflection.
+	ErrorsCounter *prometheus.CounterVec
+	// ItemsCounter is the counter of the reflected resources.
+	// A fast increase of this metric can indicate a race condition between local and remote operators.
+	ItemsCounter *prometheus.CounterVec
+)
+
+// init builds the counter vectors. If no error occurs or no item is processed, the corresponding metric is not exported.
+func init() {
+	metricsLabels := []string{"namespace", "reflector_resource"}
+
+	ErrorsCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "liqo_virtual_kubelet_reflection_error_counter",
+			Help: "The counter of the transient errors.",
+		},
+		metricsLabels,
+	)
+
+	ItemsCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "liqo_virtual_kubelet_reflection_item_counter",
+			Help: "The counter of the reflected resources. A fast increase of this metric can indicate a race condition between local and remote operators.",
+		},
+		metricsLabels,
+	)
+}
+
+// SetupMetricHandler registers the reflection counters in the default Prometheus registry and
+// starts, in a background goroutine, a plain HTTP server exposing them at /metrics on MetricsPort.
+// It panics if a counter is already registered, and exits the process if the server fails.
+func SetupMetricHandler() {
+	// Register both counters at once; MustRegister panics on duplicate registration.
+	prometheus.MustRegister(ErrorsCounter, ItemsCounter)
+
+	http.Handle("/metrics", promhttp.Handler())
+
+	go func() {
+		klog.Infof("Starting the virtual kubelet Metric Handler listening on %q", MetricsPort)
+
+		server := &http.Server{
+			Addr:              MetricsPort,
+			ReadHeaderTimeout: 10 * time.Second,
+		}
+
+		// Metrics are served over plain HTTP; ListenAndServe blocks and only returns on failure.
+		if err := server.ListenAndServe(); err != nil {
+			klog.Errorf("Failed to start the Metric Handler: %v", err)
+			os.Exit(1)
+		}
+	}()
+}
diff --git a/pkg/virtualKubelet/provider/provider.go b/pkg/virtualKubelet/provider/provider.go
index e7dffd8e8b..1263bcfc4e 100644
--- a/pkg/virtualKubelet/provider/provider.go
+++ b/pkg/virtualKubelet/provider/provider.go
@@ -75,6 +75,7 @@ type InitConfig struct {
 	EnableStorage              bool
 	VirtualStorageClassName    string
 	RemoteRealStorageClassName string
+	EnableMetrics              bool
 
 	HomeAPIServerHost string
 	HomeAPIServerPort string
diff --git a/pkg/virtualKubelet/reflection/generic/reflector.go b/pkg/virtualKubelet/reflection/generic/reflector.go
index abc566f439..2a58ff21b9 100644
--- a/pkg/virtualKubelet/reflection/generic/reflector.go
+++ b/pkg/virtualKubelet/reflection/generic/reflector.go
@@ -21,6 +21,7 @@ import (
 	"sync"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
@@ -33,6 +34,7 @@ import (
 	"k8s.io/utils/trace"
 
 	traceutils "github.com/liqotech/liqo/pkg/utils/trace"
+	"github.com/liqotech/liqo/pkg/virtualKubelet/metrics"
 	"github.com/liqotech/liqo/pkg/virtualKubelet/reflection/manager"
 	"github.com/liqotech/liqo/pkg/virtualKubelet/reflection/options"
 )
@@ -199,9 +201,27 @@ func (gr *reflector) processNextWorkItem() bool {
 
 		// Put the item back on the workqueue to handle any transient errors.
 		gr.workqueue.AddRateLimited(key)
+
+		// Increase the error counter metric.
+ metrics.ErrorsCounter.With(prometheus.Labels{"namespace": key.(types.NamespacedName).Namespace, + "reflector_resource": gr.name}).Inc() + + if errors.As(err, &eae) { + // Put the item back on the workqueue after the given duration elapsed. + gr.workqueue.AddAfter(key, eae.duration) + return true + } + + // Put the item back on the workqueue to handle any transient errors. + gr.workqueue.AddRateLimited(key) + return true } + // Increase the item counter metric. + metrics.ItemsCounter.With(prometheus.Labels{"namespace": key.(types.NamespacedName).Namespace, + "reflector_resource": gr.name}).Inc() + // Finally, if no error occurs we Forget this item so it does not // get queued again until another change happens. gr.workqueue.Forget(key) diff --git a/pkg/vkMachinery/forge/forge.go b/pkg/vkMachinery/forge/forge.go index f8874bf02c..190eedc94e 100644 --- a/pkg/vkMachinery/forge/forge.go +++ b/pkg/vkMachinery/forge/forge.go @@ -85,6 +85,13 @@ func forgeVKContainers( ValueFrom: &v1.EnvVarSource{FieldRef: &v1.ObjectFieldSelector{FieldPath: "status.podIP"}}, }, }, + Ports: []v1.ContainerPort{ + { + Name: "metrics", + ContainerPort: 9090, + Protocol: v1.ProtocolTCP, + }, + }, }, } }