diff --git a/charts/victoria-metrics-k8s-stack/.gitignore b/charts/victoria-metrics-k8s-stack/.gitignore new file mode 100644 index 000000000..89f9ac04a --- /dev/null +++ b/charts/victoria-metrics-k8s-stack/.gitignore @@ -0,0 +1 @@ +out/ diff --git a/charts/victoria-metrics-k8s-stack/Chart.lock b/charts/victoria-metrics-k8s-stack/Chart.lock index 60a5ad10b..0484a6be2 100644 --- a/charts/victoria-metrics-k8s-stack/Chart.lock +++ b/charts/victoria-metrics-k8s-stack/Chart.lock @@ -4,12 +4,12 @@ dependencies: version: 0.1.17 - name: kube-state-metrics repository: https://prometheus-community.github.io/helm-charts - version: 3.2.2 + version: 3.4.1 - name: prometheus-node-exporter repository: https://prometheus-community.github.io/helm-charts - version: 1.18.2 + version: 2.0.2 - name: grafana repository: https://grafana.github.io/helm-charts - version: 6.12.1 -digest: sha256:58ad75ef412eed7eff3fbfc4d721c33d34ca1af838f60d7297de2388dc2d8b26 -generated: "2021-07-12T20:06:41.848769241Z" + version: 6.14.1 +digest: sha256:ef56bd6d0c02f87ffbf5f3ae2debf4a8d6a914c1cd46a999940ce1d62354e039 +generated: "2021-07-27T19:55:13.172435+08:00" diff --git a/charts/victoria-metrics-k8s-stack/Chart.yaml b/charts/victoria-metrics-k8s-stack/Chart.yaml index 64e527900..434df53fe 100644 --- a/charts/victoria-metrics-k8s-stack/Chart.yaml +++ b/charts/victoria-metrics-k8s-stack/Chart.yaml @@ -2,22 +2,23 @@ apiVersion: v2 name: victoria-metrics-k8s-stack description: Kubernetes monitoring on VictoriaMetrics stack. 
Includes VictoriaMetrics Operator, Grafana dashboards, ServiceScrapes and VMRules type: application -version: 0.2.9 -appVersion: "1.16.0" +version: 0.3.0 +appVersion: "1.63.0" dependencies: -- name: victoria-metrics-operator - version: "0.1.*" - repository: https://victoriametrics.github.io/helm-charts -- name: kube-state-metrics - version: "3.2.*" - repository: https://prometheus-community.github.io/helm-charts - condition: kube-state-metrics.enabled -- name: prometheus-node-exporter - version: "1.18.*" - repository: https://prometheus-community.github.io/helm-charts - condition: prometheus-node-exporter.enabled -- name: grafana - version: "6.12.*" - repository: https://grafana.github.io/helm-charts - condition: grafana.enabled \ No newline at end of file + - name: victoria-metrics-operator + version: "0.1.*" + repository: https://victoriametrics.github.io/helm-charts + condition: operator.enabled + - name: kube-state-metrics + version: "3.4.*" + repository: https://prometheus-community.github.io/helm-charts + condition: kube-state-metrics.enabled + - name: prometheus-node-exporter + version: "2.0.*" + repository: https://prometheus-community.github.io/helm-charts + condition: prometheus-node-exporter.enabled + - name: grafana + version: "6.14.*" + repository: https://grafana.github.io/helm-charts + condition: grafana.enabled diff --git a/charts/victoria-metrics-k8s-stack/README.md b/charts/victoria-metrics-k8s-stack/README.md index e8713eb02..1f8c97768 100644 --- a/charts/victoria-metrics-k8s-stack/README.md +++ b/charts/victoria-metrics-k8s-stack/README.md @@ -1,6 +1,6 @@ # Helm Chart For Victoria Metrics kubernetes monitoring stack. 
-![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![Version: 0.2.9](https://img.shields.io/badge/Version-0.2.9-informational?style=flat-square) +![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![Version: 0.3.0](https://img.shields.io/badge/Version-0.3.0-informational?style=flat-square) Kubernetes monitoring on VictoriaMetrics stack. Includes VictoriaMetrics Operator, Grafana dashboards, ServiceScrapes and VMRules @@ -252,6 +252,7 @@ Change the values according to the need of the environment in ``victoria-metrics | alertmanager.ingress.tls | list | `[]` | | | alertmanager.monzoTemplate.enabled | bool | `true` | | | alertmanager.spec.externalURL | string | `""` | | +| alertmanager.spec.image.tag | string | `"v0.22.2"` | | | alertmanager.spec.routePrefix | string | `"/"` | | | coreDns.enabled | bool | `true` | | | coreDns.service.enabled | bool | `true` | | @@ -282,7 +283,7 @@ Change the values according to the need of the environment in ``victoria-metrics | defaultRules.rules.kubernetesSystem | bool | `true` | | | defaultRules.rules.network | bool | `true` | | | defaultRules.rules.node | bool | `true` | | -| defaultRules.runbookUrl | string | `"https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#"` | | +| defaultRules.runbookUrl | string | `"https://runbooks.prometheus-operator.dev/runbooks"` | | | fullnameOverride | string | `""` | | | grafana.additionalDataSources | list | `[]` | | | grafana.dashboardProviders."dashboardproviders.yaml".apiVersion | int | `1` | | @@ -390,6 +391,7 @@ Change the values according to the need of the environment in ``victoria-metrics | operator.cleanupCRD | bool | `true` | | | operator.cleanupSA.create | bool | `true` | | | operator.cleanupSA.name | string | `""` | | +| operator.enabled | bool | `true` | | | operator.kubectlImage.pullPolicy | string | `"IfNotPresent"` | | | operator.kubectlImage.repository | 
string | `"gcr.io/google_containers/hyperkube"` | | | operator.kubectlImage.tag | string | `"v1.16.0"` | | @@ -415,6 +417,7 @@ Change the values according to the need of the environment in ``victoria-metrics | vmagent.ingress.tls | list | `[]` | | | vmagent.spec.externalLabels.cluster | string | `"cluster-name"` | | | vmagent.spec.extraArgs."promscrape.streamParse" | string | `"true"` | | +| vmagent.spec.image.tag | string | `"v1.63.0"` | | | vmagent.spec.scrapeInterval | string | `"25s"` | | | vmalert.enabled | bool | `true` | | | vmalert.ingress.annotations | object | `{}` | | @@ -426,6 +429,54 @@ Change the values according to the need of the environment in ``victoria-metrics | vmalert.ingress.pathType | string | `"Prefix"` | | | vmalert.ingress.tls | list | `[]` | | | vmalert.spec.evaluationInterval | string | `"15s"` | | +| vmalert.spec.image.tag | string | `"v1.63.0"` | | +| vmcluster.enabled | bool | `false` | | +| vmcluster.ingress.insert.annotations | object | `{}` | | +| vmcluster.ingress.insert.enabled | bool | `false` | | +| vmcluster.ingress.insert.extraPaths | list | `[]` | | +| vmcluster.ingress.insert.hosts[0] | string | `"vminsert.domain.com"` | | +| vmcluster.ingress.insert.labels | object | `{}` | | +| vmcluster.ingress.insert.path | string | `"/"` | | +| vmcluster.ingress.insert.pathType | string | `"Prefix"` | | +| vmcluster.ingress.insert.tls | list | `[]` | | +| vmcluster.ingress.select.annotations | object | `{}` | | +| vmcluster.ingress.select.enabled | bool | `false` | | +| vmcluster.ingress.select.extraPaths | list | `[]` | | +| vmcluster.ingress.select.hosts[0] | string | `"vmselect.domain.com"` | | +| vmcluster.ingress.select.labels | object | `{}` | | +| vmcluster.ingress.select.path | string | `"/"` | | +| vmcluster.ingress.select.pathType | string | `"Prefix"` | | +| vmcluster.ingress.select.tls | list | `[]` | | +| vmcluster.ingress.storage.annotations | object | `{}` | | +| vmcluster.ingress.storage.enabled | bool | `false` | | +| 
vmcluster.ingress.storage.extraPaths | list | `[]` | | +| vmcluster.ingress.storage.hosts[0] | string | `"vmstorage.domain.com"` | | +| vmcluster.ingress.storage.labels | object | `{}` | | +| vmcluster.ingress.storage.path | string | `"/"` | | +| vmcluster.ingress.storage.pathType | string | `"Prefix"` | | +| vmcluster.ingress.storage.tls | list | `[]` | | +| vmcluster.spec.replicationFactor | int | `2` | | +| vmcluster.spec.retentionPeriod | string | `"14"` | | +| vmcluster.spec.vminsert.image.tag | string | `"v1.63.0-cluster"` | | +| vmcluster.spec.vminsert.replicaCount | int | `2` | | +| vmcluster.spec.vminsert.resources.limits.cpu | string | `"1"` | | +| vmcluster.spec.vminsert.resources.limits.memory | string | `"1000Mi"` | | +| vmcluster.spec.vminsert.resources.requests.cpu | string | `"0.5"` | | +| vmcluster.spec.vminsert.resources.requests.memory | string | `"500Mi"` | | +| vmcluster.spec.vmselect.cacheMountPath | string | `"/select-cache"` | | +| vmcluster.spec.vmselect.image.tag | string | `"v1.63.0-cluster"` | | +| vmcluster.spec.vmselect.replicaCount | int | `2` | | +| vmcluster.spec.vmselect.resources.limits.cpu | string | `"1"` | | +| vmcluster.spec.vmselect.resources.limits.memory | string | `"1000Mi"` | | +| vmcluster.spec.vmselect.resources.requests.cpu | string | `"0.5"` | | +| vmcluster.spec.vmselect.resources.requests.memory | string | `"500Mi"` | | +| vmcluster.spec.vmselect.storage.volumeClaimTemplate.spec.resources.requests.storage | string | `"2Gi"` | | +| vmcluster.spec.vmstorage.image.tag | string | `"v1.63.0-cluster"` | | +| vmcluster.spec.vmstorage.replicaCount | int | `2` | | +| vmcluster.spec.vmstorage.resources.limits.cpu | string | `"1"` | | +| vmcluster.spec.vmstorage.resources.limits.memory | string | `"1500Mi"` | | +| vmcluster.spec.vmstorage.storage.volumeClaimTemplate.spec.resources.requests.storage | string | `"10Gi"` | | +| vmcluster.spec.vmstorage.storageDataPath | string | `"/vm-data"` | | | vmsingle.enabled | bool | `true` 
| | | vmsingle.ingress.annotations | object | `{}` | | | vmsingle.ingress.enabled | bool | `false` | | @@ -435,6 +486,7 @@ Change the values according to the need of the environment in ``victoria-metrics | vmsingle.ingress.path | string | `"/"` | | | vmsingle.ingress.pathType | string | `"Prefix"` | | | vmsingle.ingress.tls | list | `[]` | | +| vmsingle.spec.image.tag | string | `"v1.63.0"` | | | vmsingle.spec.replicaCount | int | `1` | | | vmsingle.spec.retentionPeriod | string | `"14"` | | | vmsingle.spec.storage.accessModes[0] | string | `"ReadWriteOnce"` | | diff --git a/charts/victoria-metrics-k8s-stack/hack/sync_grafana_dashboards.py b/charts/victoria-metrics-k8s-stack/hack/sync_grafana_dashboards.py index 8eed4ab36..4f081b642 100644 --- a/charts/victoria-metrics-k8s-stack/hack/sync_grafana_dashboards.py +++ b/charts/victoria-metrics-k8s-stack/hack/sync_grafana_dashboards.py @@ -45,6 +45,11 @@ def new_representer(dumper, data): 'destination': '../templates/grafana/dashboards', 'type': 'json' }, + { + 'source': 'https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/cluster/dashboards/clusterbytenant.json', + 'destination': '../templates/grafana/dashboards', + 'type': 'json' + }, ] skip_list = [ @@ -63,6 +68,7 @@ def new_representer(dumper, data): 'scheduler': ' .Values.kubeScheduler.enabled', 'node-rsrc-use': ' (index .Values "prometheus-node-exporter" "enabled")', 'node-cluster-rsrc-use': ' (index .Values "prometheus-node-exporter" "enabled")', + 'clusterbytenant': ' .Values.vmcluster.enabled', } # standard header diff --git a/charts/victoria-metrics-k8s-stack/hack/sync_rules.py b/charts/victoria-metrics-k8s-stack/hack/sync_rules.py index 6215271f5..9f6b64686 100644 --- a/charts/victoria-metrics-k8s-stack/hack/sync_rules.py +++ b/charts/victoria-metrics-k8s-stack/hack/sync_rules.py @@ -95,12 +95,9 @@ def new_representer(dumper, data): } replacement_map = { - 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#': { + 
'https://runbooks.prometheus-operator.dev/runbooks': { 'replacement': '{{ .Values.defaultRules.runbookUrl }}', 'init': ''}, - 'https://github.com/prometheus-operator/kube-prometheus/wiki/': { - 'replacement': '{{ .Values.defaultRules.runbookUrl }}alert-name-', - 'init': ''}, 'job="kube-state-metrics"': { 'replacement': 'job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"', 'limitGroup': ['kubernetes-apps'], diff --git a/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl b/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl index 0a41901b6..1ab7d2607 100644 --- a/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl +++ b/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl @@ -82,57 +82,78 @@ app.kubernetes.io/name: {{ include "victoria-metrics-k8s-stack.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} -{{/* -VM endpoint -*/}} -{{- define "victoria-metrics-k8s-stack.vmEndpoint" -}} +{{/* VM read (select) and write (insert) base URLs; vmsingle takes precedence when vmsingle and vmcluster are both enabled. Ports use %v because user-supplied values are not Go ints. */}} +{{- define "victoria-metrics-k8s-stack.vmSelectEndpoint" -}} {{- if .Values.vmsingle.enabled -}} -url: "http://{{ include "victoria-metrics-k8s-stack.vmsingleName" .}}.{{ .Release.Namespace }}.svc:{{ .Values.vmsingle.spec.port | default 8429 }}" +{{ printf "http://%s.%s.svc:%v" (include "victoria-metrics-k8s-stack.vmsingleName" .) .Release.Namespace (.Values.vmsingle.spec.port | default 8429) }} +{{- end }} +{{- if and .Values.vmcluster.enabled (not .Values.vmsingle.enabled) -}} +{{ printf "http://%s-%s.%s.svc:%v/select/0/prometheus" "vmselect" (include "victoria-metrics-k8s-stack.fullname" .) 
.Release.Namespace (.Values.vmcluster.spec.vmselect.port | default 8481) }} {{- end }} {{- end }} - -{{/* -Alermanager spec -*/}} -{{- define "victoria-metrics-k8s-stack.alertmanagerSpec" -}} -{{ omit .Values.alertmanager.spec "configMaps" "configSecret" | toYaml }} -configSecret: {{ .Values.alertmanager.spec.configSecret | default (printf "%s-alertmanager" (include "victoria-metrics-k8s-stack.fullname" .)) }} -{{- if or .Values.alertmanager.spec.configMaps .Values.alertmanager.monzoTemplate.enabled }} -{{- $list := .Values.alertmanager.spec.configMaps | default (list "") }} -{{- if .Values.alertmanager.monzoTemplate.enabled }} -{{- $list = append $list (printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" $) "alertmanager-monzo-tpl" | trunc 63 | trimSuffix "-") }} +{{- define "victoria-metrics-k8s-stack.vmInsertEndpoint" -}} +{{- if .Values.vmsingle.enabled -}} +{{ printf "http://%s.%s.svc:%v" (include "victoria-metrics-k8s-stack.vmsingleName" .) .Release.Namespace (.Values.vmsingle.spec.port | default 8429) }} {{- end }} -configMaps: -{{- range compact $list }} -- {{ . }} +{{- if and .Values.vmcluster.enabled (not .Values.vmsingle.enabled) -}} +{{ printf "http://%s-%s.%s.svc:%v/insert/0/prometheus" "vminsert" (include "victoria-metrics-k8s-stack.fullname" .) .Release.Namespace (.Values.vmcluster.spec.vminsert.port | default 8480) }} {{- end }} {{- end }} + + +{{/* +VMAlert remotes: one object per key (not a list), as in the replaced dict-based spec +*/}} +{{- define "victoria-metrics-k8s-stack.vmAlertRemotes" -}} +remoteWrite: +  url: {{ include "victoria-metrics-k8s-stack.vmInsertEndpoint" . }} +remoteRead: +  url: {{ include "victoria-metrics-k8s-stack.vmSelectEndpoint" . }} +datasource: +  url: {{ include "victoria-metrics-k8s-stack.vmSelectEndpoint" . }} +notifier: +  url: {{ printf "http://%s-%s.%s.svc:9093" "vmalertmanager" (include "victoria-metrics-k8s-stack.fullname" .) 
.Release.Namespace }} {{- end }} {{/* VMAlert spec */}} {{- define "victoria-metrics-k8s-stack.vmAlertSpec" -}} -{{- $vmAlertStackRemoteWrite := dict "remoteWrite" ( include "victoria-metrics-k8s-stack.vmEndpoint" . | fromYaml ) -}} -{{- $vmAlertStackRemoteRead := dict "remoteRead" ( include "victoria-metrics-k8s-stack.vmEndpoint" . | fromYaml ) -}} -{{- $vmAlertStackDatasource := dict "datasource" ( include "victoria-metrics-k8s-stack.vmEndpoint" . | fromYaml ) -}} -{{- $vmAlertStackNotifier := dict "notifier" ( dict "url" ( printf "http://vmalertmanager-%s.%s.svc:9093" (include "victoria-metrics-k8s-stack.fullname" .) .Release.Namespace ) ) -}} -{{ deepCopy .Values.vmalert.spec | mergeOverwrite $vmAlertStackRemoteWrite $vmAlertStackRemoteRead $vmAlertStackDatasource $vmAlertStackNotifier | toYaml }} +{{ deepCopy .Values.vmalert.spec | mergeOverwrite (include "victoria-metrics-k8s-stack.vmAlertRemotes" . | fromYaml) | toYaml }} {{- end }} {{/* -VM remoteWrite +VM Agent remoteWrite */}} {{- define "victoria-metrics-k8s-stack.vmAgentRemoteWrite" -}} remoteWrite: - - url: "http://{{ .Values.vmsingle.name | default (printf "vmsingle-%s" (include "victoria-metrics-k8s-stack.fullname" .))}}.{{ .Release.Namespace }}.svc:{{ .Values.vmsingle.spec.port | default 8429 }}/api/v1/write" + - url: {{ include "victoria-metrics-k8s-stack.vmInsertEndpoint" . }}/api/v1/write {{- end }} {{/* VMAgent spec */}} {{- define "victoria-metrics-k8s-stack.vmAgentSpec" -}} -{{ deepCopy .Values.vmagent.spec | mergeOverwrite ( include "victoria-metrics-k8s-stack.vmAgentRemoteWrite" . | fromYaml ) | toYaml }} +{{ deepCopy .Values.vmagent.spec | mergeOverwrite ( include "victoria-metrics-k8s-stack.vmAgentRemoteWrite" . 
| fromYaml) | toYaml }} +{{- end }} + + +{{/* +Alertmanager spec +*/}} +{{- define "victoria-metrics-k8s-stack.alertmanagerSpec" -}} +{{ omit .Values.alertmanager.spec "configMaps" "configSecret" | toYaml }} +configSecret: {{ .Values.alertmanager.spec.configSecret | default (printf "%s-alertmanager" (include "victoria-metrics-k8s-stack.fullname" .)) }} +{{- if or .Values.alertmanager.spec.configMaps .Values.alertmanager.monzoTemplate.enabled }} +{{- $list := .Values.alertmanager.spec.configMaps | default (list "") }} +{{- if .Values.alertmanager.monzoTemplate.enabled }} +{{- $list = append $list (printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" $) "alertmanager-monzo-tpl" | trunc 63 | trimSuffix "-") }} +{{- end }} +configMaps: +{{- range compact $list }} +- {{ . }} +{{- end }} {{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/alertmanager-overview.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/alertmanager-overview.yaml new file mode 100644 index 000000000..3b765f032 --- /dev/null +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/alertmanager-overview.yaml @@ -0,0 +1,607 @@ +{{- /* +Generated from 'alertmanager-overview' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Do not change in-place! 
In order to change this file first read following link: +https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack +*/ -}} +{{- if and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + namespace: {{ .Release.Namespace }} + name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" $) "alertmanager-overview" | trunc 63 | trimSuffix "-" }} + labels: + {{- if $.Values.grafana.sidecar.dashboards.label }} + {{ $.Values.grafana.sidecar.dashboards.label }}: "1" + {{- end }} + app: {{ include "victoria-metrics-k8s-stack.name" $ }}-grafana +{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }} +data: + alertmanager-overview.json: |- + { + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 1, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], 
+ "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Received", + "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Invalid", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Alerts receive rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, 
+ "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alerts", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "integration", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Total", + "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Failed", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "$integration: Notifications Send Rate", + "tooltip": { + "shared": true, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "integration", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} 99th Percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Median", + "refId": "B" + }, + { + "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by 
(namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Average", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "$integration: Notification Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Notifications", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "alertmanager-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(alertmanager_alerts, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, 
+ "name": "service", + "options": [ + + ], + "query": "label_values(alertmanager_alerts, service)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": null, + "multi": false, + "name": "integration", + "options": [ + + ], + "query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Alertmanager / Overview", + "uid": "alertmanager-overview", + "version": 0 + } +{{- end }} \ No newline at end of file diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/clusterbytenant.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/clusterbytenant.yaml new file mode 100644 index 000000000..35860fd77 --- /dev/null +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/clusterbytenant.yaml @@ -0,0 +1,695 @@ +{{- /* +Generated from 'clusterbytenant' from https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/cluster/dashboards/clusterbytenant.json +Do not change in-place! 
In order to change this file first read following link: +https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack +*/ -}} +{{- if and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.vmcluster.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + namespace: {{ .Release.Namespace }} + name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" $) "clusterbytenant" | trunc 63 | trimSuffix "-" }} + labels: + {{- if $.Values.grafana.sidecar.dashboards.label }} + {{ $.Values.grafana.sidecar.dashboards.label }}: "1" + {{- end }} + app: {{ include "victoria-metrics-k8s-stack.name" $ }}-grafana +{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }} +data: + clusterbytenant.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Overview for enterprise cluster VictoriaMetrics v1.56.0 or higher", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 13, + "iteration": 1617980754279, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "How many datapoints are inserted into storage per second by accountID and projectID", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.4", + "pointradius": 2, + "points": 
false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(vm_tenant_inserted_rows_total{job=~\"$job\", instance=~\"$instance\",accountID=~\"$accountID\", projectID=~\"$projectID\"}[1m])/60) by (accountID,projectID) ", + "interval": "", + "legendFormat": "inserted rows: {{`{{`}}accountID{{`}}`}}:{{`{{`}}projectID{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Datapoints ingestion rate ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Request rate accepted by vmselect nodes per tenant", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": 
false, + "targets": [ + { + "expr": "sum(rate(vm_tenant_select_requests_total{job=~\"$job\", instance=~\"$instance.*\",accountID=~\"$accountID\", projectID=~\"$projectID\"}[5m])) by (accountID,projectID) ", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "tenant: {{`{{`}}accountID{{`}}`}}{{`{{`}}projectID{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Query rate ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. 
\n\nSee following link for details:", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "targetBlank": true, + "title": "troubleshooting", + "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#troubleshooting" + } + ], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_tenant_active_timeseries{job=~\"$job\", instance=~\"$instance.*\",accountID=~\"$accountID\",projectID=~\"$projectID\"}) by(accountID,projectID)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Active time series tenant: {{`{{`}}accountID{{`}}`}}:{{`{{`}}projectID{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Active time series ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + 
"description": "Shows how many of new time-series are created every second. High churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(vm_tenant_timeseries_created_total{job=~\"$job\", instance=~\"$instance\",accountID=~\"$accountID\", projectID=~\"$projectID\"}[1m])/60) by(accountID,projectID)", + "interval": "", + "legendFormat": "churn rate tenant: {{`{{`}}accountID{{`}}`}}:{{`{{`}}projectID{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Churn rate ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 
null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows amount of on-disk space occupied by data points.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.4", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_tenant_used_tenant_bytes{job=\"$job_storage\", instance=~\"$instance\",accountID=~\"$accountID\",projectID=~\"$projectID\"}) by(accountID,projectID)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{`{{`}}accountID{{`}}`}}:{{`{{`}}projectID{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk space usage (datapoints) ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 26, + "style": "dark", + "tags": [ + 
"VictoriaMetrics", + "monitoring" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "gw", + "value": "gw" + }, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "ds", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "$ds", + "definition": "label_values(vm_app_version{version=~\"^vm(insert|select|storage).*\"}, job)", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "job", + "options": [], + "query": "label_values(vm_app_version{version=~\"^vm(insert|select|storage).*\"}, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "$ds", + "definition": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "$ds", + "definition": "label_values(vm_tenant_active_timeseries{job=~\"$job\"},accountID)", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "accountID", + "options": [], + "query": "label_values(vm_tenant_active_timeseries{job=~\"$job\"},accountID)", + "refresh": 1, + 
"regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "$ds", + "definition": "label_values(vm_tenant_active_timeseries{accountID=~\"$accountID\"},projectID)", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "projectID", + "options": [], + "query": "label_values(vm_tenant_active_timeseries{accountID=~\"$accountID\"},projectID)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "VictoriaMetrics cluster per tenant Copy", + "uid": "IZFqd3lMz", + "version": 1 + } +{{- end }} \ No newline at end of file diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/controller-manager.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/controller-manager.yaml index 572b330b7..b5b76d4e1 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/controller-manager.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/controller-manager.yaml @@ -173,10 +173,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", "refId": "A" } ], @@ 
-279,10 +279,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", "refId": "A" } ], @@ -385,10 +385,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))", + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", "refId": "A" } ], diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml index a2e03eb63..de52d5e1e 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml @@ -244,7 +244,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", + "expr": 
"sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -496,7 +496,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", + "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -591,7 +591,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}namespace{{`}}`}}", @@ -882,7 +882,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -900,7 +900,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, 
"intervalFactor": 2, @@ -909,7 +909,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -918,7 +918,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1318,7 +1318,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", + "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1327,7 +1327,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1336,7 +1336,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", + "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": 
true, "intervalFactor": 2, @@ -1345,7 +1345,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -2146,7 +2146,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -2232,7 +2232,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -2330,7 +2330,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -2416,7 +2416,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml index 18f2753e7..70dd9b322 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml @@ -75,7 +75,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", 
"format": "time_series", "instant": true, "intervalFactor": 2, @@ -159,7 +159,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -443,7 +443,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -694,7 +694,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -703,7 +703,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -712,7 +712,7 @@ data: "step": 10 }, { - "expr": 
"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -721,7 +721,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -730,7 +730,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1168,7 +1168,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ 
-1177,7 +1177,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1186,7 +1186,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1195,7 +1195,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1839,7 +1839,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1925,7 +1925,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -2023,7 +2023,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, 
"logBase": 1, "max": null, @@ -2109,7 +2109,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-node.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-node.yaml index 75cab8d1b..758119a64 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-node.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-node.yaml @@ -74,7 +74,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -309,7 +309,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -318,7 +318,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -327,7 +327,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by 
(pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -336,7 +336,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -345,7 +345,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -746,7 +746,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -755,7 +755,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", + "expr": 
"sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -764,7 +764,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -773,7 +773,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-pod.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-pod.yaml index 4bb66eb14..09b182896 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-pod.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-pod.yaml @@ -91,7 +91,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", + "expr": 
"sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}container{{`}}`}}", @@ -447,7 +447,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -456,7 +456,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -465,7 +465,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -474,7 +474,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) 
by (container)", + "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -483,7 +483,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -919,7 +919,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -928,7 +928,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by 
(container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -937,7 +937,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", + "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -946,7 +946,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1291,7 +1291,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1378,7 +1378,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1477,7 +1477,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1564,7 +1564,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workload.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workload.yaml index 2e96a4374..baee08097 100644 --- 
a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workload.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workload.yaml @@ -74,7 +74,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -309,7 +309,7 @@ data: ], "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -327,7 +327,7 @@ data: "step": 10 }, { - "expr": "sum(\n 
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -345,7 +345,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -1517,7 +1517,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1603,7 +1603,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1701,7 +1701,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1787,7 +1787,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml index 32f5f0688..f06866d84 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml @@ -95,7 +95,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n 
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}workload{{`}}`}} - {{`{{`}}workload_type{{`}}`}}", @@ -393,7 +393,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -411,7 +411,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, 
workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -429,7 +429,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -1704,7 +1704,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1790,7 +1790,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1888,7 +1888,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, @@ -1974,7 +1974,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "pps", "label": null, "logBase": 1, "max": null, diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/kubelet.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/kubelet.yaml index 1d84c7b9e..925c0d074 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/kubelet.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/kubelet.yaml @@ -37,2384 +37,2099 @@ data: "links": [ ], - "refresh": "10s", - "rows": [ + "panels": [ { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": 
"value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Kubelets", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", 
job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Pods", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", 
- "title": "Running Container", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ + "mappings": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Actual Volume Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 
0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "id": 6, - "interval": null, - "links": [ + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "links": [ - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Desired Volume Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": 
true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Config Error Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" + "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "title": "Running Kubelets", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - 
"total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "links": [ - }, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { 
+ "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Running Pods", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "mappings": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "unit": "none" } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 4, + "links": [ + ], - "repeat": null, - "repeatIteration": null, - 
"repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Running Container", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - 
"refId": "A" - } - ], - "thresholds": [ + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Operation duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "unit": "none" } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 5, + "links": [ + ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Actual Volume Count", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + 
"datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} pod", - "refId": "A" - }, - { - "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} worker", - "refId": "B" - } - ], - "thresholds": [ + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 6, + "links": [ - }, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - 
"values": true - }, - "lines": true, - "linewidth": 1, + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Desired Volume Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} pod", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} worker", - "refId": "B" - } - ], - "thresholds": [ + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + 
"thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "unit": "none" } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 7, + "links": [ + ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Config Error Count", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + "aliasColors": { - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, 
+ "w": 12, + "x": 0, + "y": 7 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Rate", + "tooltip": { + 
"shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 
1, + "links": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - 
"fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": 
"s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, 
- "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager operation rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} pod", + "refId": "A" }, { - "aliasColors": { - - }, - 
"bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} worker", + "refId": "B" + } + ], + "thresholds": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} pod", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} worker", + "refId": "B" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "thresholds": [ + + ], + "timeFrom": null, + "timeShift":
null, + "title": "Pod Start Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "Pod lifecycle event generator", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + "aliasColors": { - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - 
"intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 
10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": 
null, + "title": "Storage Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist interval", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", 
job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - 
"nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager operation rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, 
+ "w": 12, + "x": 12, + "y": 42 + }, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - 
"datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Pod lifecycle event generator", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 21, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "2xx", - "refId": "A" - }, - { - "expr": 
"sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "3xx", - "refId": "B" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "4xx", - "refId": "C" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "5xx", - "refId": "D" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": 
true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist interval", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": 
false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 22, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, 
+ "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "Request duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 21, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "seriesOverrides": [ - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + 
"format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } + ], + "thresholds": [ - }, - "id": 23, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "RPC Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, 
+ "w": 24, + "x": 0, + "y": 70 + }, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + ], + "timeFrom": null, + "timeShift": null, + "title": "Request duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": 
"graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 24, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 77 + }, + "id": 23, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "timeFrom": null, - "timeShift": null, - "title": 
"CPU usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 77 + }, + "id": 24, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 25, - "legend": { - "alignAsTable": false, 
- "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": 
"time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 77 + }, + "id": 25, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] } + ], + "refresh": "10s", + "rows": [ + ], "schemaVersion": 14, "style": "dark", diff --git 
a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/scheduler.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/scheduler.yaml index 29cab75a3..b11b4da31 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/scheduler.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/scheduler.yaml @@ -173,31 +173,31 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} e2e", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e", "refId": "A" }, { - "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} binding", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding", "refId": "B" }, { - "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling 
algorithm", "refId": "C" }, { - "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} volume", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume", "refId": "D" } ], @@ -287,31 +287,31 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} e2e", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} binding", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": 
"histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm", "refId": "C" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} volume", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume", "refId": "D" } ], diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/victoriametrics.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/victoriametrics.yaml index 95dddd8b0..342b0cac3 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/victoriametrics.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/victoriametrics.yaml @@ -24,12 +24,12 @@ data: "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "7.1.1" + "version": "8.0.0" }, { "type": "panel", "id": "graph", - "name": "Graph", + "name": "Graph (old)", "version": "" }, { @@ -40,15 +40,15 @@ data: }, { "type": "panel", - "id": "singlestat", - "name": "Singlestat", + "id": "stat", + "name": "Stat", "version": "" }, { "type": "panel", "id": "text", "name": "Text", - "version": "7.1.0" + "version": "" } ], "annotations": { @@ -64,12 +64,12 @@ data: } ] }, - "description": "Overview for single node VictoriaMetrics v1.56.0 or higher", + 
"description": "Overview for single node VictoriaMetrics v1.57.0 or higher", "editable": true, "gnetId": 10229, "graphTooltip": 0, "id": null, - "iteration": 1616956884194, + "iteration": 1624970666582, "links": [ { "icon": "doc", @@ -99,7 +99,7 @@ data: ], "panels": [ { - "collapsed": false, + "collapsed": true, "datasource": "$ds", "gridPos": { "h": 1, @@ -108,706 +108,956 @@ data: "y": 0 }, "id": 6, - "panels": [], - "title": "Configuration", + "panels": [ + { + "datasource": null, + "description": "", + "gridPos": { + "h": 2, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 85, + "options": { + "content": "
$version
", + "mode": "markdown" + }, + "pluginVersion": "8.0.0", + "title": "Version", + "type": "text" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "How many datapoints are in storage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 4, + "y": 1 + }, + "id": 26, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total datapoints", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "Total amount of used disk space", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 9, + "y": 1 + }, + "id": 81, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + 
"exemplar": true, + "expr": "sum(vm_data_size_bytes{job=\"$job\", type!=\"indexdb\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk space usage", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "Average disk usage per datapoint.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 14, + "y": 1 + }, + "id": 82, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_data_size_bytes{job=\"$job\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", type!=\"indexdb\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Bytes per point", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "Total size of allowed memory via flag `-memory.allowedPercent`", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 19, + "y": 1 + }, + "id": 79, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + 
"colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_allowed_memory_bytes{job=\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Allowed memory", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1800 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 0, + "y": 3 + }, + "id": 87, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "exemplar": true, + "expr": "vm_app_uptime_seconds{job=\"$job\", instance=\"$instance\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "How many entries inverted index contains. 
This value is proportional to the number of unique timeseries in storage(cardinality).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 4, + "y": 3 + }, + "id": 38, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Index size", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "The minimum free disk space left", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 9, + "y": 3 + }, + "id": 80, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "exemplar": true, + "expr": "min(vm_free_disk_space_bytes{job=\"$job\", instance=~\"$instance\"})", + "format": "time_series", + 
"instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Min free disk space", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "Total number of available CPUs for VM process", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 14, + "y": 3 + }, + "id": 77, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_available_cpu_cores{job=\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Available CPU", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "$ds", + "description": "Total size of available memory for VM process", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 19, + "y": 3 + }, + "id": 78, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + 
"calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "exemplar": true, + "expr": "sum(vm_available_memory_bytes{job=\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Available memory", + "type": "stat" + } + ], + "title": "Stats", "type": "row" }, { - "content": "
$version
", + "collapsed": false, "datasource": "$ds", - "description": "", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "gridPos": { - "h": 2, - "w": 6, + "h": 1, + "w": 24, "x": 0, "y": 1 }, - "id": 2, - "links": [ - { - "targetBlank": true, - "title": "VictoriaMetrics releases", - "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/releases" - } - ], - "mode": "html", - "options": { - "content": "
$version
", - "mode": "html" - }, - "pluginVersion": "7.1.0", - "timeFrom": null, - "timeShift": null, - "title": "Version", - "type": "text" + "id": 24, + "panels": [], + "title": "Performance", + "type": "row" }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "$ds", - "description": "How many datapoints are in storage", + "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)", "fieldConfig": { "defaults": { - "custom": {} + "links": [] }, "overrides": [] }, - "format": "short", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 2, - "w": 6, - "x": 6, - "y": 1 + "h": 8, + "w": 12, + "x": 0, + "y": 2 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true }, - "id": 26, - "interval": null, + "lines": true, + "linewidth": 1, "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - 
"show": true + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true }, - "tableColumn": "", + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", + "expr": "sum(rate(vm_http_requests_total{job=\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by (path) > 0", "format": "time_series", - "instant": false, + "interval": "", "intervalFactor": 1, + "legendFormat": "{{`{{`}}path{{`}}`}}", "refId": "A" } ], - "thresholds": "", + "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "Total datapoints", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" + "title": "Requests rate ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true } ], - "valueName": "current" + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "$ds", - "description": "The size of the free disk space left", + "description": "The less time it takes is better.\n* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* 
`/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)", "fieldConfig": { "defaults": { - "custom": {} + "links": [] }, "overrides": [] }, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 2, - "w": 6, + "h": 8, + "w": 12, "x": 12, - "y": 1 + "y": 2 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true }, - "id": 80, - "interval": null, + "lines": true, + "linewidth": 1, "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true }, - "tableColumn": "", + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum(vm_free_disk_space_bytes{job=\"$job\", instance=~\"$instance\", path=\"/storage\"})", + "expr": "max(vm_request_duration_seconds{job=\"$job\", instance=~\"$instance\", quantile=~\"(0.5|0.99)\"}) by (path, quantile) > 0", "format": "time_series", - "instant": false, - "interval": "", "intervalFactor": 1, - "legendFormat": "", + 
"legendFormat": "{{`{{`}}quantile{{`}}`}} ({{`{{`}}path{{`}}`}})", "refId": "A" } ], - "thresholds": "", + "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "Free disk space", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" + "title": "Query duration ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true } ], - "valueName": "current" + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "$ds", - "description": "Total size of available memory for VM process", + "description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. 
\n\nSee following link for details:", "fieldConfig": { "defaults": { - "custom": {} + "links": [] }, "overrides": [] }, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 2, - "w": 6, - "x": 18, - "y": 1 + "h": 8, + "w": 12, + "x": 0, + "y": 10 }, - "id": 78, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, + "hiddenSeries": false, + "id": 51, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" + "targetBlank": true, + "title": "troubleshooting", + "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#troubleshooting" } ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true + "nullPointMode": "null", + "options": { + "alertThreshold": true }, - "tableColumn": "", + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "expr": "sum(vm_available_memory_bytes{job=\"$job\", instance=~\"$instance\"})", + "expr": "vm_cache_entries{job=\"$job\", instance=~\"$instance\", type=\"storage/hour_metric_ids\"}", "format": "time_series", - "instant": false, - "interval": "", "intervalFactor": 1, - "legendFormat": "", + 
"legendFormat": "Active time series", "refId": "A" } ], - "thresholds": "", + "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "Available memory", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" + "title": "Active time series ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true } ], - "valueName": "current" + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "$ds", + "description": "VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with -`memory.allowedPercent` flag. 
Line `max allowed` shows max allowed memory size for cache.", "fieldConfig": { "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "s", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 0, - "y": 3 - }, - "id": 8, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "vm_app_uptime_seconds{instance=\"victoriametrics:8428\", job=\"victoriametrics\"}", - "targets": [ - { - "expr": "vm_app_uptime_seconds{job=\"$job\", instance=\"$instance\"}", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Uptime", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "description": "How many entries inverted index contains. 
This value is proportional to the number of unique timeseries in storage(cardinality).", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "short", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 6, - "y": 3 - }, - "id": 38, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"})", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Index size", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "description": "Total number of available CPUs for VM process", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "short", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 12, - "y": 3 - }, - "id": 77, - "interval": null, - "links": [], - "mappingType": 1, - 
"mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(vm_available_cpu_cores{job=\"$job\", instance=~\"$instance\"})", - "format": "time_series", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Available CPU", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$ds", - "description": "Total size of allowed memory via flag `-memory.allowedPercent`", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 6, - "x": 18, - "y": 3 - }, - "id": 79, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - 
"fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(vm_allowed_memory_bytes{job=\"$job\", instance=~\"$instance\"})", - "format": "time_series", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "timeFrom": null, - "timeShift": null, - "title": "Allowed memory", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "collapsed": false, - "datasource": "$ds", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 5 - }, - "id": 24, - "panels": [], - "title": "Performance", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] + "links": [] }, "overrides": [] }, @@ -816,16 +1066,16 @@ data: "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 6 + "x": 12, + "y": 10 }, "hiddenSeries": false, - "id": 12, + "id": 33, "legend": { "alignAsTable": true, "avg": true, "current": true, - "max": false, + "max": true, "min": false, "show": true, "sort": "current", @@ -836,34 +1086,50 @@ data: "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null as zero", + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "max 
allowed", + "color": "#C4162A" + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(vm_http_requests_total{job=\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by (path) > 0", + "expr": "sum(vm_cache_size_bytes{job=\"$job\", instance=\"$instance\"})", "format": "time_series", - "interval": "", + "hide": false, "intervalFactor": 1, - "legendFormat": "{{`{{`}}path{{`}}`}}", + "legendFormat": "size", "refId": "A" + }, + { + "expr": "max(vm_allowed_memory_bytes{job=\"$job\", instance=\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "max allowed", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Requests rate ($instance)", + "title": "Cache size ($instance)", "tooltip": { "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -876,7 +1142,7 @@ data: }, "yaxes": [ { - "format": "short", + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -903,10 +1169,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "The less time it takes is better.\n* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)", + "description": "Shows how many ongoing insertions (not API /write calls) on disk are taking place, where:\n* `max` - equal to number of CPUs;\n* `current` - current number of goroutines busy with inserting rows into underlying storage.\n\nEvery successful API /write call results into flush on disk. However, these two actions are separated and controlled via different concurrency limiters. The `max` on this panel can't be changed and always equal to number of CPUs. 
\n\nWhen `current` hits `max` constantly, it means storage is overloaded and requires more CPU.\n\n", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -916,15 +1181,17 @@ data: "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 6 + "x": 0, + "y": 18 }, "hiddenSeries": false, - "id": 22, + "id": 59, "legend": { "alignAsTable": true, "avg": true, "current": true, + "hideEmpty": false, + "hideZero": false, "max": false, "min": false, "show": true, @@ -936,30 +1203,46 @@ data: "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null as zero", + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "max", + "color": "#C4162A" + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max(vm_request_duration_seconds{job=\"$job\", instance=~\"$instance\", quantile=~\"(0.5|0.99)\"}) by (path, quantile) > 0", + "expr": "sum(vm_concurrent_addrows_capacity{job=\"$job\", instance=\"$instance\"})", "format": "time_series", + "interval": "", "intervalFactor": 1, - "legendFormat": "{{`{{`}}quantile{{`}}`}} ({{`{{`}}path{{`}}`}})", + "legendFormat": "max", "refId": "A" + }, + { + "expr": "sum(vm_concurrent_addrows_current{job=\"$job\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "current", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Query duration ($instance)", + "title": "Concurrent flushes on disk ($instance)", "tooltip": { "shared": true, "sort": 2, @@ -975,7 +1258,8 @@ data: }, "yaxes": [ { - "format": "s", + "decimals": 0, + "format": "short", "label": null, "logBase": 1, "max": null, @@ -983,6 +1267,7 @@ data: "show": true }, { + "decimals": 0, "format": "short", "label": null, 
"logBase": 1, @@ -1002,10 +1287,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. \n\nSee following link for details:", + "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1015,32 +1299,32 @@ data: "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 14 + "x": 12, + "y": 18 }, "hiddenSeries": false, - "id": 51, + "id": 35, "legend": { - "avg": false, - "current": false, + "alignAsTable": true, + "avg": true, + "current": true, "max": false, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, - "links": [ - { - "targetBlank": true, - "title": "troubleshooting", - "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#troubleshooting" - } - ], - "nullPointMode": "null", + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1050,10 +1334,12 @@ data: "steppedLine": false, "targets": [ { - "expr": "vm_cache_entries{job=\"$job\", instance=~\"$instance\", type=\"storage/hour_metric_ids\"}", + "exemplar": true, + "expr": "sum(rate(vm_http_request_errors_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (path) > 0", "format": "time_series", + "interval": "", "intervalFactor": 1, - "legendFormat": "Active time series", + "legendFormat": "{{`{{`}}path{{`}}`}}", 
"refId": "A" } ], @@ -1061,10 +1347,10 @@ data: "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Active time series ($instance)", + "title": "Requests error rate ($instance)", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1098,16 +1384,29 @@ data: "alignLevel": null } }, + { + "collapsed": false, + "datasource": "$ds", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 14, + "panels": [], + "title": "Storage", + "type": "row" + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with -`memory.allowedPercent` flag. Line `max allowed` shows max allowed memory size for cache.", + "description": "How many datapoints are inserted into storage per second", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1117,64 +1416,163 @@ data: "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 14 + "x": 0, + "y": 27 }, "hiddenSeries": false, - "id": 33, + "id": 10, "legend": { - "avg": false, - "current": false, + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, "max": false, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null", + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [ - { - "alias": "max allowed", - "color": "#C4162A" - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(vm_cache_size_bytes{job=\"$job\", instance=\"$instance\"})", + "expr": 
"sum(rate(vm_rows_inserted_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (type) > 0", "format": "time_series", "hide": false, "intervalFactor": 1, - "legendFormat": "size", + "legendFormat": "{{`{{`}}type{{`}}`}}", "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Datapoints ingestion rate ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "expr": "max(vm_allowed_memory_bytes{job=\"$job\", instance=\"$instance\"})", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows the time needed to reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "hiddenSeries": false, + "id": 73, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": false, + "min": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.0", + 
"pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"} / ignoring(path) ((rate(vm_rows_added_to_storage_total{job=\"$job\", instance=\"$instance\"}[1d]) - ignoring(type) rate(vm_deduplicated_samples_total{job=\"$job\", instance=\"$instance\", type=\"merge\"}[1d])) * scalar(sum(vm_data_size_bytes{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"})))", "format": "time_series", "hide": false, + "interval": "", "intervalFactor": 1, - "legendFormat": "max allowed", - "refId": "B" + "legendFormat": "", + "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Cache size ($instance)", + "title": "Storage full ETA ($instance)", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1187,7 +1585,8 @@ data: }, "yaxes": [ { - "format": "bytes", + "decimals": null, + "format": "s", "label": null, "logBase": 1, "max": null, @@ -1214,10 +1613,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "Shows how many ongoing insertions (not API /write calls) on disk are taking place, where:\n* `max` - equal to number of CPUs;\n* `current` - current number of goroutines busy with inserting rows into underlying storage.\n\nEvery successful API /write call results into flush on disk. However, these two actions are separated and controlled via different concurrency limiters. The `max` on this panel can't be changed and always equal to number of CPUs. 
\n\nWhen `current` hits `max` constantly, it means storage is overloaded and requires more CPU.\n\n", + "description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1228,17 +1626,15 @@ data: "h": 8, "w": 12, "x": 0, - "y": 22 + "y": 35 }, "hiddenSeries": false, - "id": 59, + "id": 30, "legend": { "alignAsTable": true, "avg": true, "current": true, - "hideEmpty": false, - "hideZero": false, - "max": false, + "max": true, "min": false, "show": true, "sort": "current", @@ -1250,15 +1646,18 @@ data: "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { - "alias": "max", - "color": "#C4162A" + "alias": "bytes-per-datapoint", + "yaxis": 2 } ], "spaceLength": 10, @@ -1266,18 +1665,19 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(vm_concurrent_addrows_capacity{job=\"$job\", instance=\"$instance\"})", + "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "max", + "legendFormat": "total datapoints", "refId": "A" }, { - "expr": "sum(vm_concurrent_addrows_current{job=\"$job\", instance=\"$instance\"})", + "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", "format": "time_series", + "interval": "", "intervalFactor": 1, - "legendFormat": "current", + "legendFormat": "bytes-per-datapoint", "refId": "B" } ], @@ -1285,7 +1685,7 @@ data: "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Concurrent flushes on disk ($instance)", + "title": "Datapoints ($instance)", "tooltip": { "shared": true, "sort": 2, 
@@ -1301,7 +1701,6 @@ data: }, "yaxes": [ { - "decimals": 0, "format": "short", "label": null, "logBase": 1, @@ -1310,8 +1709,8 @@ data: "show": true }, { - "decimals": 0, - "format": "short", + "decimals": 2, + "format": "bytes", "label": null, "logBase": 1, "max": null, @@ -1330,10 +1729,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)", + "description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*`, since VictoriaMetrics pushes pending data to persistent storage every second.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1344,15 +1742,15 @@ data: "h": 8, "w": 12, "x": 12, - "y": 22 + "y": 35 }, "hiddenSeries": false, - "id": 35, + "id": 34, "legend": { "alignAsTable": true, "avg": true, "current": true, - "max": false, + "max": true, "min": false, "show": true, "sort": "current", @@ -1363,33 +1761,50 @@ data: "lines": true, "linewidth": 1, "links": [], - "nullPointMode": "null as zero", + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "pending index entries", + "yaxis": 2 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(vm_http_request_errors_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (path) > 0", + "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"storage\"}", "format": "time_series", + "hide": 
false, "intervalFactor": 1, - "legendFormat": "{{`{{`}}path{{`}}`}}", + "legendFormat": "pending datapoints", "refId": "A" + }, + { + "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "pending index entries", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Requests error rate ($instance)", + "title": "Pending datapoints ($instance)", "tooltip": { "shared": true, - "sort": 2, + "sort": 0, "value_type": "individual" }, "type": "graph", @@ -1410,7 +1825,8 @@ data: "show": true }, { - "format": "short", + "decimals": 3, + "format": "none", "label": null, "logBase": 1, "max": null, @@ -1424,1237 +1840,727 @@ data: } }, { - "collapsed": true, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "$ds", + "description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 1, - "w": 24, + "h": 8, + "w": 12, "x": 0, - "y": 30 + "y": 43 }, - "id": 14, - "panels": [ + "hiddenSeries": false, + "id": 53, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "How many 
datapoints are inserted into storage per second", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 40 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideZero": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(vm_rows_inserted_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (type) > 0", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{`{{`}}type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Datapoints ingestion rate ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Used", + "refId": "A" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows the time needed to 
reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 40 - }, - "hiddenSeries": false, - "id": 73, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideZero": true, - "max": false, - "min": false, - "show": false, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"} / ignoring(path) ((rate(vm_rows_added_to_storage_total{job=\"$job\", instance=\"$instance\"}[1d]) - ignoring(type) rate(vm_deduplicated_samples_total{job=\"$job\", instance=\"$instance\", type=\"merge\"}[1d])) * scalar(sum(vm_data_size_bytes{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"})))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Storage full ETA ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - 
"show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Free", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk space usage - datapoints ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 48 - }, - "hiddenSeries": false, - "id": 30, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "bytes-per-datapoint", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "total datapoints", - "refId": "A" - }, - { - "expr": 
"sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "bytes-per-datapoint", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Datapoints ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "decimals": 2, - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)", + "fieldConfig": { + "defaults": { + "links": [] }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 43 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": 
"null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*`, since VictoriaMetrics pushes pending data to persistent storage every second.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 48 - }, - "hiddenSeries": false, - "id": 34, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "pending index entries", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"storage\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "pending datapoints", - "refId": "A" - }, - { - "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "pending index entries", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Pending datapoints ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", 
- "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "decimals": 3, - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "expr": "sum(vm_parts{job=\"$job\", instance=\"$instance\"}) by (type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{`{{`}}type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "LSM parts ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 56 - }, - "hiddenSeries": false, - "id": 53, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": 
"sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Used", - "refId": "A" - }, - { - "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Free", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk space usage - datapoints ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows amount of on-disk space occupied by inverted index.", + "fieldConfig": { + "defaults": { + "links": [] }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "hiddenSeries": false, + "id": 55, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 56 - }, - "hiddenSeries": false, - "id": 36, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_parts{job=\"$job\", instance=\"$instance\"}) by (type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{`{{`}}type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "LSM parts ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - 
"alignLevel": null - } + "exemplar": true, + "expr": "vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "disk space used", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk space usage - index ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows amount of on-disk space occupied by inverted index.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 64 - }, - "hiddenSeries": false, - "id": 55, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk space usage - index ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", 
- "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "The number of on-going merges in storage nodes. It is expected to have high numbers for `storage/small` metric.", + "fieldConfig": { + "defaults": { + "links": [] }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 51 + }, + "hiddenSeries": false, + "id": 62, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "The number of on-going merges in storage nodes. 
It is expected to have high numbers for `storage/small` metric.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 64 - }, - "hiddenSeries": false, - "id": 62, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_active_merges{job=\"$job\", instance=\"$instance\"}) by(type)", - "legendFormat": "{{`{{`}}type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Active merges ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "expr": "sum(vm_active_merges{job=\"$job\", instance=\"$instance\"}) by(type)", + "legendFormat": "{{`{{`}}type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Active merges ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows the number of bytes read/write from the storage layer.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 72 - }, - "hiddenSeries": false, - "id": 76, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "read", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(process_io_storage_read_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "read", - "refId": "A" - }, - { - "expr": "sum(rate(process_io_storage_written_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "write", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk writes/reads ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - 
"logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.", + "fieldConfig": { + "defaults": { + "links": [] }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 59 + }, + "hiddenSeries": false, + "id": 58, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "The number of rows merged per second by storage nodes.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 72 - }, - "hiddenSeries": false, - "id": 64, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - 
"spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(vm_rows_merged_total{job=\"$job\", instance=\"$instance\"}[5m])) by(type)", - "legendFormat": "{{`{{`}}type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Merge speed ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "exemplar": true, + "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{`{{`}}reason{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rows ignored ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "The number of rows merged per second by storage nodes.", + "fieldConfig": { + "defaults": { + "links": [] + }, + 
"overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 59 + }, + "hiddenSeries": false, + "id": 64, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(vm_rows_merged_total{job=\"$job\", instance=\"$instance\"}[5m])) by(type)", + "legendFormat": "{{`{{`}}type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Merge speed ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:867", + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 80 - }, - "hiddenSeries": false, - "id": 58, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - 
"percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason) > 0", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{`{{`}}reason{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Rows ignored ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "$$hashKey": "object:868", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows the rate of logging the messages by their level. 
Unexpected spike in rate is a good reason to check logs.", + "fieldConfig": { + "defaults": { + "links": [] }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 67 + }, + "hiddenSeries": false, + "id": 67, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 80 - }, - "hiddenSeries": false, - "id": 67, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=\"$instance\"}[5m])) by (level) ", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{`{{`}}level{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - 
"timeShift": null, - "title": "Logging rate ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=\"$instance\"}[5m])) by (level) ", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{`{{`}}level{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Logging rate ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true } ], - "title": "Storage", - "type": "row" + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": true, @@ -2663,7 +2569,7 @@ data: "h": 1, "w": 24, "x": 0, - "y": 31 + "y": 75 }, "id": 71, "panels": [ @@ -2676,7 +2582,6 @@ data: "description": "Shows the rate and total number of new series created over last 24h.\n\nHigh churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nThe higher churn rate is, the more resources required to handle it. 
Consider to keep the churn rate as low as possible.\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2692,19 +2597,25 @@ data: "hiddenSeries": false, "id": 66, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2781,7 +2692,6 @@ data: "description": "Slow queries rate according to `search.logSlowQueryDuration` flag, which is `5s` by default.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2797,20 +2707,26 @@ data: "hiddenSeries": false, "id": 60, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2879,7 +2795,6 @@ data: "description": "The percentage of slow inserts comparing to total insertion rate during the last 5 minutes. \n\nThe less value is better. 
If percentage remains high (>50%) during extended periods of time, then it is likely more RAM is needed for optimal handling of the current number of active time series. \n\nIn general, VictoriaMetrics requires ~1KB or RAM per active time series, so it should be easy calculating the required amounts of RAM for the current workload according to capacity planning docs. But the resulting number may be far from the real number because the required amounts of memory depends on may other factors such as the number of labels per time series and the length of label values.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2895,20 +2810,26 @@ data: "hiddenSeries": false, "id": 68, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2978,7 +2899,6 @@ data: "description": "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n\nThis prevents from ingesting metrics with too many labels. 
The value of `maxLabelsPerTimeseries` must be adjusted for your workload.\n\nWhen limit is exceeded (graph is > 0) - extra labels are dropped, which could result in unexpected identical time series.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2994,20 +2914,24 @@ data: "hiddenSeries": false, "id": 74, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": false, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3017,12 +2941,13 @@ data: "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(increase(vm_metrics_with_dropped_labels_total{job=\"$job\", instance=\"$instance\"}[5m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "limit exceeded", "refId": "A" } ], @@ -3079,7 +3004,7 @@ data: "h": 1, "w": 24, "x": 0, - "y": 32 + "y": 76 }, "id": 46, "panels": [ @@ -3092,7 +3017,6 @@ data: "description": "", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3103,25 +3027,31 @@ data: "h": 8, "w": 12, "x": 0, - "y": 103 + "y": 29 }, "hiddenSeries": false, "id": 44, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, 
"renderer": "flot", @@ -3162,6 +3092,16 @@ data: "intervalFactor": 1, "legendFormat": "resident", "refId": "D" + }, + { + "exemplar": true, + "expr": "sum(process_resident_memory_anon_bytes{job=\"$job\", instance=\"$instance\"})", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "resident anonymous", + "refId": "E" } ], "thresholds": [], @@ -3213,7 +3153,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3224,25 +3163,31 @@ data: "h": 8, "w": 12, "x": 12, - "y": 103 + "y": 29 }, "hiddenSeries": false, "id": 57, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3310,7 +3255,6 @@ data: "description": "Panel shows the number of open file descriptors in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3321,25 +3265,31 @@ data: "h": 8, "w": 12, "x": 0, - "y": 111 + "y": 37 }, "hiddenSeries": false, "id": 75, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + 
"options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3418,10 +3368,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "Shows avg GC duration", + "description": "Shows the number of bytes read/write from the storage layer.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3432,46 +3381,68 @@ data: "h": 8, "w": 12, "x": 12, - "y": 111 + "y": 37 }, "hiddenSeries": false, - "id": 42, + "id": 76, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "read", + "transform": "negative-Y" + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(go_gc_duration_seconds_sum{job=\"$job\", instance=\"$instance\"}[5m]))\n/\nsum(rate(go_gc_duration_seconds_count{job=\"$job\", instance=\"$instance\"}[5m]))", + "expr": "sum(rate(process_io_storage_read_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "avg gc duration", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "read", "refId": "A" + }, + { + "expr": "sum(rate(process_io_storage_written_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "write", + "refId": "B" } 
], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "GC duration ($instance)", + "title": "Disk writes/reads ($instance)", "tooltip": { "shared": true, "sort": 0, @@ -3487,11 +3458,12 @@ data: }, "yaxes": [ { - "format": "s", + "decimals": null, + "format": "bytes", "label": null, "logBase": 1, "max": null, - "min": "0", + "min": null, "show": true }, { @@ -3516,7 +3488,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3527,25 +3498,31 @@ data: "h": 8, "w": 12, "x": 0, - "y": 119 + "y": 45 }, "hiddenSeries": false, "id": 47, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3610,10 +3587,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "", + "description": "Shows avg GC duration", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3624,25 +3600,31 @@ data: "h": 8, "w": 12, "x": 12, - "y": 119 + "y": 45 }, "hiddenSeries": false, - "id": 37, + "id": 42, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, 
"points": false, "renderer": "flot", @@ -3652,11 +3634,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(vm_tcplistener_conns{job=\"$job\", instance=\"$instance\"})", + "expr": "sum(rate(go_gc_duration_seconds_sum{job=\"$job\", instance=\"$instance\"}[5m]))\n/\nsum(rate(go_gc_duration_seconds_count{job=\"$job\", instance=\"$instance\"}[5m]))", "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "connections", + "intervalFactor": 2, + "legendFormat": "avg gc duration", "refId": "A" } ], @@ -3664,7 +3645,7 @@ data: "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "TCP connections ($instance)", + "title": "GC duration ($instance)", "tooltip": { "shared": true, "sort": 0, @@ -3680,8 +3661,7 @@ data: }, "yaxes": [ { - "decimals": null, - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -3710,7 +3690,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3721,25 +3700,31 @@ data: "h": 8, "w": 12, "x": 0, - "y": 127 + "y": 53 }, "hiddenSeries": false, "id": 48, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3807,7 +3792,6 @@ data: "description": "", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3818,25 +3802,134 @@ data: "h": 8, "w": 12, "x": 12, - "y": 127 + "y": 53 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + 
"show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_tcplistener_conns{job=\"$job\", instance=\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "connections", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP connections ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 61 }, "hiddenSeries": false, "id": 49, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, 
"percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3902,9 +3995,12 @@ data: } ], "refresh": "30s", - "schemaVersion": 26, + "schemaVersion": 30, "style": "dark", - "tags": [], + "tags": [ + "victoriametrics", + "vmsingle" + ], "templating": { "list": [ { @@ -3913,6 +4009,8 @@ data: "text": "VictoriaMetrics", "value": "VictoriaMetrics" }, + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, @@ -3931,19 +4029,23 @@ data: "current": {}, "datasource": "$ds", "definition": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)", + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "job", "options": [], - "query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)", + "query": { + "query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)", + "refId": "VictoriaMetrics-job-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -3953,19 +4055,23 @@ data: "current": {}, "datasource": "$ds", "definition": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", + "description": null, + "error": null, "hide": 2, "includeAll": false, "label": null, "multi": false, "name": "version", "options": [], - "query": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", + "query": { + "query": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)", + "refId": "VictoriaMetrics-version-Variable-Query" + }, "refresh": 1, "regex": "/.*-tags-(v\\d+\\.\\d+\\.\\d+)/", "skipUrlSync": false, "sort": 2, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -3975,19 +4081,23 @@ data: "current": {}, "datasource": "$ds", "definition": 
"label_values(vm_app_version{job=~\"$job\"}, instance)", + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "instance", "options": [], - "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "query": { + "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "refId": "VictoriaMetrics-instance-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/vmagent.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/vmagent.yaml index 4ab3d2dbd..f6e188fe4 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/vmagent.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/vmagent.yaml @@ -24,12 +24,12 @@ data: "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "7.1.1" + "version": "8.0.0" }, { "type": "panel", "id": "graph", - "name": "Graph", + "name": "Graph (old)", "version": "" }, { @@ -70,12 +70,12 @@ data: } ] }, - "description": "Overview for VictoriaMetrics vmagent v1.56.0 or higher", + "description": "Overview for VictoriaMetrics vmagent v1.57.0 or higher", "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1616957263139, + "iteration": 1623414948941, "links": [ { "icon": "doc", @@ -107,6 +107,10 @@ data: { "collapsed": false, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -123,7 +127,6 @@ data: "description": "Shows total number of all configured scrape targets in state \"up\".\n\nSee `http://vmagent-host:8429/targets` to get list of all targets. 
\n", "fieldConfig": { "defaults": { - "custom": {}, "mappings": [], "thresholds": { "mode": "absolute", @@ -156,9 +159,10 @@ data: "fields": "", "values": false }, + "text": {}, "textMode": "auto" }, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"})", @@ -177,7 +181,6 @@ data: "description": "Shows total number of all configured scrape targets in state \"down\".\n\nSee `http://vmagent-host:8429/targets` to get list of all targets. \n", "fieldConfig": { "defaults": { - "custom": {}, "mappings": [], "thresholds": { "mode": "absolute", @@ -220,9 +223,10 @@ data: "fields": "", "values": false }, + "text": {}, "textMode": "auto" }, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"})", @@ -241,7 +245,6 @@ data: "description": "Shows number of generated error messages in logs over last 30m. Non-zero value may be a sign of connectivity or missconfiguration errors.", "fieldConfig": { "defaults": { - "custom": {}, "mappings": [], "min": 0, "thresholds": { @@ -287,9 +290,10 @@ data: "fields": "", "values": false }, + "text": {}, "textMode": "auto" }, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "targets": [ { "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[30m]))", @@ -308,7 +312,6 @@ data: "description": "Persistent queue size shows size of pending samples in bytes which hasn't been flushed to remote storage yet. \nIncreasing of value might be a sign of connectivity issues. 
In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.", "fieldConfig": { "defaults": { - "custom": {}, "mappings": [], "thresholds": { "mode": "absolute", @@ -346,9 +349,10 @@ data: "fields": "", "values": false }, + "text": {}, "textMode": "auto" }, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "targets": [ { "expr": "sum(vm_persistentqueue_bytes_pending{job=~\"$job\", instance=~\"$instance\"})", @@ -365,12 +369,6 @@ data: { "columns": [], "datasource": "$ds", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "fontSize": "100%", "gridPos": { "h": 7, @@ -383,7 +381,7 @@ data: "scroll": true, "showHeader": true, "sort": { - "col": null, + "col": 3, "desc": false }, "styles": [ @@ -467,7 +465,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -501,8 +498,11 @@ data: "lines": true, "linewidth": 1, "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -572,7 +572,6 @@ data: "description": "Shows in/out samples rate including push and pull models. 
\n\nThe out-rate could be different to in-rate because of replication or additional timeseries added by vmagent for every scraped target.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -588,19 +587,25 @@ data: "hiddenSeries": false, "id": 5, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -678,7 +683,6 @@ data: "description": "Shows the rate of requests served by vmagent HTTP server.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -694,20 +698,26 @@ data: "hiddenSeries": false, "id": 15, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -773,7 +783,6 @@ data: "description": "Network usage shows the bytes rate for data accepted by vmagent and pushed via remotewrite protocol.\nDiscrepancies are possible because of different protocols used for ingesting, scraping and writing data.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -789,19 +798,26 @@ data: "hiddenSeries": false, "id": 7, "legend": { - "avg": false, - "current": 
false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -878,7 +894,6 @@ data: "description": "Errors rate shows rate for multiple metrics that track possible errors in vmagent, such as network or parsing errors.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -894,13 +909,16 @@ data: "hiddenSeries": false, "id": 69, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -912,8 +930,11 @@ data: } ], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1003,7 +1024,6 @@ data: "description": "Shows rate of dropped samples from persistent queue. 
VMagent drops samples from queue if in-memory and on-disk queues are full and it is unable to flush them to remote storage.\nThe max size of on-disk queue is configured by `-remoteWrite.maxDiskUsagePerURL` flag.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1019,13 +1039,16 @@ data: "hiddenSeries": false, "id": 49, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -1037,8 +1060,11 @@ data: } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1104,7 +1130,6 @@ data: "description": "Shows the persistent queue size of pending samples in bytes which hasn't been flushed to remote storage yet. \n\nIncreasing of value might be a sign of connectivity issues. 
In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1120,13 +1145,16 @@ data: "hiddenSeries": false, "id": 17, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -1137,8 +1165,11 @@ data: } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1204,7 +1235,6 @@ data: "description": "Shows the rate of dropped samples due to relabeling. 
\nMetric tracks drops for `-remoteWrite.relabelConfig` configuration only.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1220,13 +1250,16 @@ data: "hiddenSeries": false, "id": 18, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -1238,8 +1271,11 @@ data: } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1311,7 +1347,6 @@ data: "description": "Shows the rate of dropped data blocks in cases when remote storage replies with `400 Bad Request` and `409 Conflict` HTTP responses.\n\nSee https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1327,20 +1362,26 @@ data: "hiddenSeries": false, "id": 79, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1350,9 +1391,10 @@ data: "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(rate(vmagent_remotewrite_packets_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval]))", "interval": "", - "legendFormat": "", + "legendFormat": "dropped", "refId": "A" } ], @@ -1400,6 +1442,10 @@ data: { 
"collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -1416,7 +1462,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1432,19 +1477,25 @@ data: "hiddenSeries": false, "id": 48, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1454,7 +1505,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"}) by(type)", + "exemplar": true, + "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"}) by(type) > 0", "format": "time_series", "interval": "", "legendFormat": "{{`{{`}}type{{`}}`}}", @@ -1471,7 +1523,6 @@ data: "sort": 2, "value_type": "individual" }, - "transparent": true, "type": "graph", "xaxis": { "buckets": null, @@ -1511,7 +1562,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1527,19 +1577,25 @@ data: "hiddenSeries": false, "id": 76, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", 
"pointradius": 2, "points": false, "renderer": "flot", @@ -1549,6 +1605,7 @@ data: "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"}) by(type) > 0", "format": "time_series", "interval": "", @@ -1566,7 +1623,6 @@ data: "sort": 2, "value_type": "individual" }, - "transparent": true, "type": "graph", "xaxis": { "buckets": null, @@ -1606,7 +1662,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1622,19 +1677,25 @@ data: "hiddenSeries": false, "id": 20, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1710,7 +1771,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1726,19 +1786,25 @@ data: "hiddenSeries": false, "id": 31, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1821,7 +1887,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -1837,19 +1902,25 @@ data: 
"hiddenSeries": false, "id": 46, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -1928,12 +1999,6 @@ data: "dataFormat": "tsbuckets", "datasource": "$ds", "description": "works in vm only disclaimer", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "gridPos": { "h": 8, "w": 12, @@ -1991,6 +2056,10 @@ data: { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -2008,7 +2077,6 @@ data: "description": "Shows the rate of write requests served by ingestserver (UDP, TCP connections) and HTTP server.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2024,20 +2092,26 @@ data: "hiddenSeries": false, "id": 73, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2047,13 +2121,15 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(vm_ingestserver_requests_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net)", + "exemplar": true, + "expr": 
"sum(rate(vm_ingestserver_requests_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net) > 0", "interval": "", "legendFormat": "{{`{{`}} type {{`}}`}} ({{`{{`}}net{{`}}`}})", "refId": "A" }, { - "expr": "sum(rate(vmagent_http_requests_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol)", + "exemplar": true, + "expr": "sum(rate(vmagent_http_requests_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol) > 0", "interval": "", "legendFormat": "{{`{{`}} protocol {{`}}`}} (http)", "refId": "B" @@ -2109,7 +2185,6 @@ data: "description": "Shows the rate of write errors in ingestserver (UDP, TCP connections) and HTTP server.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2125,20 +2200,26 @@ data: "hiddenSeries": false, "id": 77, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2148,13 +2229,15 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(vm_ingestserver_request_errors_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net)", + "exemplar": true, + "expr": "sum(rate(vm_ingestserver_request_errors_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net) > 0", "interval": "", "legendFormat": "{{`{{`}} type {{`}}`}} ({{`{{`}}net{{`}}`}})", "refId": "A" }, { - "expr": "sum(rate(vmagent_http_request_errors_total{job=~\"$job\", 
instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol)", + "exemplar": true, + "expr": "sum(rate(vmagent_http_request_errors_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol) > 0", "interval": "", "legendFormat": "{{`{{`}} protocol {{`}}`}} (http)", "refId": "B" @@ -2210,7 +2293,6 @@ data: "description": "Shows the rate of parsed rows from write or scrape requests.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2226,20 +2308,26 @@ data: "hiddenSeries": false, "id": 78, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2249,9 +2337,10 @@ data: "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(rate(vm_protoparser_rows_read_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(type)", "interval": "", - "legendFormat": "{{`{{`}} type {{`}}`}} ({{`{{`}}net{{`}}`}})", + "legendFormat": "{{`{{`}} type {{`}}`}}", "refId": "A" } ], @@ -2305,7 +2394,6 @@ data: "description": "Tracks the rate of dropped invalid rows because of errors while unmarshaling write requests. 
The exact errors messages will be printed in logs.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2321,19 +2409,25 @@ data: "hiddenSeries": false, "id": 50, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2343,7 +2437,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(vm_rows_invalid_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(type)", + "exemplar": true, + "expr": "sum(rate(vm_rows_invalid_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(type) > 0", "interval": "", "legendFormat": "{{`{{`}}type{{`}}`}}", "refId": "A" @@ -2397,6 +2492,10 @@ data: { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -2414,7 +2513,6 @@ data: "description": "Shows the rate of requests to configured remote write endpoints by url and status code.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.\n\n", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2430,19 +2528,25 @@ data: "hiddenSeries": false, "id": 60, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": 
true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2509,7 +2613,6 @@ data: "description": "Shows the global rate for number of written bytes via remote write connections.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2525,19 +2628,25 @@ data: "hiddenSeries": false, "id": 66, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2603,7 +2712,6 @@ data: "description": "Shows requests retry rate by url. Number of retries is unlimited but protected with delays up to 1m between attempts.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.\n\n", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2619,19 +2727,25 @@ data: "hiddenSeries": false, "id": 61, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2641,9 +2755,10 @@ data: "steppedLine": false, "targets": [ { + "exemplar": true, "expr": 
"sum(rate(vmagent_remotewrite_retries_count_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(url)", "interval": "", - "legendFormat": "{{`{{`}} url {{`}}`}}", + "legendFormat": "", "refId": "A" } ], @@ -2697,7 +2812,6 @@ data: "description": "Shows current number of established connections to remote write endpoints.\n\n", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -2713,19 +2827,25 @@ data: "hiddenSeries": false, "id": 65, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -2797,12 +2917,6 @@ data: "dataFormat": "tsbuckets", "datasource": "$ds", "description": "Shows the remote write request block size distribution in rows.", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "gridPos": { "h": 8, "w": 12, @@ -2868,12 +2982,6 @@ data: "dataFormat": "tsbuckets", "datasource": "$ds", "description": "Shows the remote write request block size distribution in bytes.", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "gridPos": { "h": 8, "w": 12, @@ -2939,12 +3047,6 @@ data: "dataFormat": "tsbuckets", "datasource": "$ds", "description": "Shows the remote write request duration distribution in seconds. 
Value depends on block size, network quality and remote storage performance.", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, "gridPos": { "h": 8, "w": 24, @@ -3002,6 +3104,10 @@ data: { "collapsed": true, "datasource": "$ds", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, "gridPos": { "h": 1, "w": 24, @@ -3019,7 +3125,6 @@ data: "description": "Shows the CPU usage per vmagent instance. \nIf you think that usage is abnormal or unexpected pls file an issue and attach CPU profile if possible.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3035,13 +3140,16 @@ data: "hiddenSeries": false, "id": 35, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -3053,8 +3161,11 @@ data: } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3118,10 +3229,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "Amount of used memory (resident)\n\nIf you think that usage is abnormal or unexpected pls file an issue and attach memory profile if possible.", + "description": "Amount of used memory\n\nIf you think that usage is abnormal or unexpected, please file an issue and attach memory profile if possible.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3137,13 +3247,16 @@ data: "hiddenSeries": false, "id": 37, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - 
"values": false + "values": true }, "lines": true, "linewidth": 1, @@ -3155,8 +3268,11 @@ data: } ], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3166,10 +3282,19 @@ data: "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}) by (instance)", "interval": "", - "legendFormat": "{{`{{`}}instance{{`}}`}}", + "legendFormat": "resident {{`{{`}}instance{{`}}`}}", "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(process_resident_memory_anon_bytes{job=~\"$job\", instance=~\"$instance\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "anonymous {{`{{`}}instance{{`}}`}}", + "refId": "B" } ], "thresholds": [], @@ -3222,7 +3347,6 @@ data: "description": "Panel shows the number of open file descriptors in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3238,20 +3362,26 @@ data: "hiddenSeries": false, "id": 83, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3332,7 +3462,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ 
-3348,20 +3477,26 @@ data: "hiddenSeries": false, "id": 39, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3430,7 +3565,6 @@ data: "description": "Shows the number of bytes read/write from the storage layer when vmagent has to buffer data on disk or read already buffered data.", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3446,20 +3580,26 @@ data: "hiddenSeries": false, "id": 81, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3542,7 +3682,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3558,20 +3697,26 @@ data: "hiddenSeries": false, "id": 41, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + 
"alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3638,7 +3783,6 @@ data: "datasource": "$ds", "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -3654,20 +3798,26 @@ data: "hiddenSeries": false, "id": 43, "legend": { - "avg": false, - "current": false, - "max": false, + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, "min": false, - "show": false, + "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.1", + "pluginVersion": "8.0.0", "pointradius": 2, "points": false, "renderer": "flot", @@ -3731,7 +3881,7 @@ data: } ], "refresh": false, - "schemaVersion": 26, + "schemaVersion": 30, "style": "dark", "tags": [ "vmagent", @@ -3745,6 +3895,8 @@ data: "text": "VictoriaMetrics", "value": "VictoriaMetrics" }, + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, @@ -3763,19 +3915,23 @@ data: "current": {}, "datasource": "$ds", "definition": "label_values(vm_app_version{version=~\"^vmagent.*\"}, job)", + "description": null, + "error": null, "hide": 0, "includeAll": false, "label": null, "multi": true, "name": "job", "options": [], - "query": "label_values(vm_app_version{version=~\"^vmagent.*\"}, job)", + "query": { + "query": "label_values(vm_app_version{version=~\"^vmagent.*\"}, job)", + "refId": "VictoriaMetrics-job-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -3785,19 +3941,23 @@ data: "current": {}, "datasource": "$ds", "definition": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "description": null, + "error": null, "hide": 
0, "includeAll": true, "label": null, "multi": true, "name": "instance", "options": [], - "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "query": { + "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "refId": "VictoriaMetrics-instance-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/datasource.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/datasource.yaml index 96560801c..039eba30d 100644 --- a/charts/victoria-metrics-k8s-stack/templates/grafana/datasource.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/grafana/datasource.yaml @@ -13,13 +13,13 @@ metadata: app: {{ include "victoria-metrics-k8s-stack.name" $ }}-grafana {{- include "victoria-metrics-k8s-stack.labels" . | nindent 4 }} data: -{{- if .Values.vmsingle.enabled }} +{{- if or .Values.vmsingle.enabled .Values.vmcluster.enabled }} datasource.yaml: |- apiVersion: 1 datasources: - name: VictoriaMetrics type: prometheus - {{ include "victoria-metrics-k8s-stack.vmEndpoint" . }} + url: {{ include "victoria-metrics-k8s-stack.vmSelectEndpoint" . }} access: proxy isDefault: true {{- end }} diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/general.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/general.rules.yaml index b0d121f9f..d85009f21 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/general.rules.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/general.rules.yaml @@ -26,7 +26,7 @@ spec: - alert: TargetDown annotations: description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-targetdown + runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/targetdown summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 for: 10m @@ -48,7 +48,7 @@ spec: "DeadMansSnitch" integration in PagerDuty. ' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-watchdog + runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/watchdog summary: An alert that should always be firing to certify that Alertmanager is working properly. expr: vector(1) labels: diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/k8s.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/k8s.rules.yaml index 4928d8710..174b35c3a 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/k8s.rules.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/k8s.rules.yaml @@ -25,11 +25,11 @@ spec: rules: - expr: |- sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - expr: |- container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, @@ -54,6 +54,12 @@ spec: max by(namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_swap + - expr: |- + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on 
(namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests - expr: |- sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( @@ -65,6 +71,12 @@ spec: ) ) record: namespace_memory:kube_pod_container_resource_requests:sum + - expr: |- + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests - expr: |- sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( @@ -76,6 +88,40 @@ spec: ) ) record: namespace_cpu:kube_pod_container_resource_requests:sum + - expr: |- + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits + - expr: |- + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_limits:sum + - expr: |- + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits + - expr: |- + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, 
container, cluster) ( + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_limits:sum - expr: |- max by (cluster, namespace, workload, pod) ( label_replace( diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-availability.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-availability.rules.yaml index fc8c9f4c8..a006cb9e9 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-availability.rules.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-availability.rules.yaml @@ -24,6 +24,16 @@ spec: - interval: 3m name: kube-apiserver-availability.rules rules: + - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 + record: code_verb:apiserver_request_total:increase30d + - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + labels: + verb: read + record: code:apiserver_request_total:increase30d + - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + labels: + verb: write + record: code:apiserver_request_total:increase30d - expr: |- 1 - ( ( @@ -38,14 +48,14 @@ spec: - ( ( - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="1"}[30d])) or vector(0) ) + - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="5"}[30d])) + - sum by (cluster) 
(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="40"}[30d])) ) ) + # errors @@ -63,14 +73,14 @@ spec: ( # too slow ( - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30d])) or vector(0) ) + - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30d])) + - sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30d])) ) + # errors @@ -98,62 +108,20 @@ spec: labels: verb: write record: apiserver_request:availability30d - - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 - record: code_verb:apiserver_request_total:increase30d - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) 
(increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) 
(increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h])) - record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read - record: code:apiserver_request_total:increase30d - - 
expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + record: code_resource:apiserver_request_total:rate5m + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write - record: code:apiserver_request_total:increase30d + record: code_resource:apiserver_request_total:rate5m + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h {{- end }} \ No newline at end of file diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-burnrate.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-burnrate.rules.yaml new file mode 100644 index 000000000..7950b818b --- /dev/null +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-burnrate.rules.yaml @@ -0,0 +1,327 @@ +{{- /* +Generated from 'kube-apiserver-burnrate.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Do not change in-place! 
In order to change this file first read following link: +https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack +*/ -}} +{{- if and .Values.defaultRules.create }} +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: {{ .Release.Namespace }} + name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) "kube-apiserver-burnrate.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ include "victoria-metrics-k8s-stack.name" $ }} +{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-apiserver-burnrate.rules + rules: + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + labels: + verb: read + record: apiserver_request:burnrate1d + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) + - + ( + ( + sum by (cluster) 
(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + labels: + verb: read + record: apiserver_request:burnrate1h + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + labels: + verb: read + record: apiserver_request:burnrate2h + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m])) + + + 
sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + labels: + verb: read + record: apiserver_request:burnrate30m + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + labels: + verb: read + record: apiserver_request:burnrate3d + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + ) + / + sum by 
(cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: apiserver_request:burnrate5m + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h])) + + + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + labels: + verb: read + record: apiserver_request:burnrate6h + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + labels: + verb: write + record: apiserver_request:burnrate1d + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) + / + 
sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + labels: + verb: write + record: apiserver_request:burnrate1h + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + labels: + verb: write + record: apiserver_request:burnrate2h + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + labels: + verb: write + record: apiserver_request:burnrate30m + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + labels: + verb: write + record: apiserver_request:burnrate3d + - expr: |- + ( + ( + # too slow + sum by (cluster) 
(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: apiserver_request:burnrate5m + - expr: |- + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + - + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + labels: + verb: write + record: apiserver_request:burnrate6h +{{- end }} \ No newline at end of file diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-histogram.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-histogram.rules.yaml new file mode 100644 index 000000000..85ddf2f0c --- /dev/null +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-histogram.rules.yaml @@ -0,0 +1,48 @@ +{{- /* +Generated from 'kube-apiserver-histogram.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Do not change in-place! 
In order to change this file first read following link: +https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack +*/ -}} +{{- if and .Values.defaultRules.create }} +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: {{ .Release.Namespace }} + name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) "kube-apiserver-histogram.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ include "victoria-metrics-k8s-stack.name" $ }} +{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-apiserver-histogram.rules + rules: + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 + labels: + quantile: '0.99' + verb: read + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 + labels: + quantile: '0.99' + verb: write + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: '0.99' + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, 
pod)) + labels: + quantile: '0.9' + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: '0.5' + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile +{{- end }} \ No newline at end of file diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-slos.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-slos.yaml index d85d1c187..f7a4dde76 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-slos.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-slos.yaml @@ -26,7 +26,7 @@ spec: - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: |- sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) @@ -43,7 +43,7 @@ spec: - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: |- sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) @@ -60,7 +60,7 @@ spec: - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: |- sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) @@ -77,7 +77,7 @@ spec: - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: |- sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-state-metrics.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-state-metrics.yaml index 1ee495ea0..d8fe76402 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kube-state-metrics.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-state-metrics.yaml @@ -26,7 +26,7 @@ spec: - alert: KubeStateMetricsListErrors annotations: description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricslisterrors + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors summary: kube-state-metrics is experiencing errors in list operations. expr: |- (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) @@ -42,7 +42,7 @@ spec: - alert: KubeStateMetricsWatchErrors annotations: description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricswatcherrors + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors summary: kube-state-metrics is experiencing errors in watch operations. expr: |- (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) @@ -58,7 +58,7 @@ spec: - alert: KubeStateMetricsShardingMismatch annotations: description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardingmismatch + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch summary: kube-state-metrics sharding is misconfigured. expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 for: 15m @@ -70,7 +70,7 @@ spec: - alert: KubeStateMetricsShardsMissing annotations: description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardsmissing + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing summary: kube-state-metrics shards are missing. 
expr: |- 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-apps.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-apps.yaml index 3517d07e5..4e9d6aedf 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-apps.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-apps.yaml @@ -27,9 +27,12 @@ spec: - alert: KubePodCrashLooping annotations: description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping summary: Pod is crash looping. - expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) * 60 * 5 > 0 + expr: |- + increase(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) > 0 + and + kube_pod_container_status_waiting{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} == 1 for: 15m labels: severity: warning @@ -39,7 +42,7 @@ spec: - alert: KubePodNotReady annotations: description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready summary: Pod has been in a non-ready state for more than 15 minutes. 
expr: |- sum by (namespace, pod) ( @@ -58,7 +61,7 @@ spec: - alert: KubeDeploymentGenerationMismatch annotations: description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch summary: Deployment generation mismatch due to possible roll-back expr: |- kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} @@ -73,12 +76,12 @@ spec: - alert: KubeDeploymentReplicasMismatch annotations: description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch summary: Deployment has not matched the expected number of replicas. expr: |- ( kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != + > kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} ) and ( changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) @@ -94,7 +97,7 @@ spec: - alert: KubeStatefulSetReplicasMismatch annotations: description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch summary: Deployment has not matched the expected number of replicas. expr: |- ( @@ -115,7 +118,7 @@ spec: - alert: KubeStatefulSetGenerationMismatch annotations: description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch summary: StatefulSet generation mismatch due to possible roll-back expr: |- kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} @@ -130,7 +133,7 @@ spec: - alert: KubeStatefulSetUpdateNotRolledOut annotations: description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout summary: StatefulSet update has not been rolled out. expr: |- ( @@ -159,7 +162,7 @@ spec: - alert: KubeDaemonSetRolloutStuck annotations: description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck summary: DaemonSet rollout is stuck. 
expr: |- ( @@ -194,7 +197,7 @@ spec: - alert: KubeContainerWaiting annotations: description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontainerwaiting + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting summary: Pod container waiting longer than 1 hour expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0 for: 1h @@ -206,7 +209,7 @@ spec: - alert: KubeDaemonSetNotScheduled annotations: description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled summary: DaemonSet pods are not scheduled. expr: |- kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} @@ -221,7 +224,7 @@ spec: - alert: KubeDaemonSetMisScheduled annotations: description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled summary: DaemonSet pods are misscheduled. expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 for: 15m @@ -233,7 +236,7 @@ spec: - alert: KubeJobCompletion annotations: description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than 12 hours to complete. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobcompletion summary: Job did not complete in time expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 for: 12h @@ -245,7 +248,7 @@ spec: - alert: KubeJobFailed annotations: description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed summary: Job failed to complete. expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 for: 15m @@ -256,23 +259,23 @@ spec: {{- end }} - alert: KubeHpaReplicasMismatch annotations: - description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch + description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch summary: HPA has not matched descired number of replicas. 
expr: |- - (kube_hpa_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} != - kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) and - (kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > - kube_hpa_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) and - (kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} < - kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) and - changes(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 for: 15m labels: severity: warning @@ -281,13 +284,13 @@ spec: {{- end }} - alert: KubeHpaMaxedOut annotations: - description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has been running at max replicas for longer than 15 minutes. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout + description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout summary: HPA is running at max replicas expr: |- - kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} == - kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} for: 15m labels: severity: warning diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-resources.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-resources.yaml index 1158e8762..165209a67 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-resources.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-resources.yaml @@ -26,7 +26,7 @@ spec: - alert: KubeCPUOvercommit annotations: description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit summary: Cluster has overcommitted CPU resource requests. expr: |- sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) @@ -43,7 +43,7 @@ spec: - alert: KubeMemoryOvercommit annotations: description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryovercommit + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit summary: Cluster has overcommitted memory resource requests. expr: |- sum(namespace_memory:kube_pod_container_resource_requests:sum{}) @@ -62,7 +62,7 @@ spec: - alert: KubeCPUQuotaOvercommit annotations: description: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuquotaovercommit + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit summary: Cluster has overcommitted CPU resource requests. expr: |- sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) @@ -78,7 +78,7 @@ spec: - alert: KubeMemoryQuotaOvercommit annotations: description: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryquotaovercommit + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit summary: Cluster has overcommitted memory resource requests. expr: |- sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) @@ -94,7 +94,7 @@ spec: - alert: KubeQuotaAlmostFull annotations: description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaalmostfull + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull summary: Namespace quota is going to be full. expr: |- kube_resourcequota{job="kube-state-metrics", type="used"} @@ -110,7 +110,7 @@ spec: - alert: KubeQuotaFullyUsed annotations: description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotafullyused + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused summary: Namespace quota is fully used. expr: |- kube_resourcequota{job="kube-state-metrics", type="used"} @@ -126,7 +126,7 @@ spec: - alert: KubeQuotaExceeded annotations: description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded summary: Namespace quota has exceeded the limits. expr: |- kube_resourcequota{job="kube-state-metrics", type="used"} @@ -142,7 +142,7 @@ spec: - alert: CPUThrottlingHigh annotations: description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh summary: Processes experience elevated CPU throttling. 
expr: |- sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-storage.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-storage.yaml index 0eb9141ca..56217031d 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-storage.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-storage.yaml @@ -27,13 +27,16 @@ spec: - alert: KubePersistentVolumeFillingUp annotations: description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: |- - kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} - < 0.03 + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0 for: 1m labels: severity: critical @@ -43,7 +46,7 @@ spec: - alert: KubePersistentVolumeFillingUp annotations: description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: |- ( @@ -52,6 +55,8 @@ spec: kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} ) < 0.15 and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0 + and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 for: 1h labels: @@ -62,7 +67,7 @@ spec: - alert: KubePersistentVolumeErrors annotations: description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors summary: PersistentVolume is having issues with provisioning. expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 for: 5m diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-apiserver.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-apiserver.yaml index 3d84cdb5a..f3726e8d6 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-apiserver.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-apiserver.yaml @@ -26,7 +26,7 @@ spec: - alert: KubeClientCertificateExpiration annotations: description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: @@ -37,7 +37,7 @@ spec: - alert: KubeClientCertificateExpiration annotations: description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: @@ -48,7 +48,7 @@ spec: - alert: AggregatedAPIErrors annotations: description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapierrors + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/aggregatedapierrors summary: An aggregated API has reported errors. expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 labels: @@ -59,7 +59,7 @@ spec: - alert: AggregatedAPIDown annotations: description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapidown + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/aggregatedapidown summary: An aggregated API is down. expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 for: 5m @@ -72,7 +72,7 @@ spec: - alert: KubeAPIDown annotations: description: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="apiserver"} == 1) for: 15m @@ -85,7 +85,7 @@ spec: - alert: KubeAPITerminatedRequests annotations: description: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapiterminatedrequests + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests summary: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests. expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-controller-manager.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-controller-manager.yaml index 7c7d108d8..56c95826a 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-controller-manager.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-controller-manager.yaml @@ -27,7 +27,7 @@ spec: - alert: KubeControllerManagerDown annotations: description: KubeControllerManager has disappeared from Prometheus target discovery. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-controller-manager"} == 1) for: 15m diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-kubelet.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-kubelet.yaml index a4443a079..4b324efed 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-kubelet.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-kubelet.yaml @@ -26,7 +26,7 @@ spec: - alert: KubeNodeNotReady annotations: description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready summary: Node is not ready. expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 for: 15m @@ -38,7 +38,7 @@ spec: - alert: KubeNodeUnreachable annotations: description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodeunreachable + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable summary: Node is unreachable. 
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 for: 15m @@ -50,7 +50,7 @@ spec: - alert: KubeletTooManyPods annotations: description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods summary: Kubelet is running at capacity. expr: |- count by(node) ( @@ -69,7 +69,7 @@ spec: - alert: KubeNodeReadinessFlapping annotations: description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodereadinessflapping + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping summary: Node readiness status is flapping. expr: sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 for: 15m @@ -81,7 +81,7 @@ spec: - alert: KubeletPlegDurationHigh annotations: description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletplegdurationhigh + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. 
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m @@ -93,7 +93,7 @@ spec: - alert: KubeletPodStartUpLatencyHigh annotations: description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletpodstartuplatencyhigh + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh summary: Kubelet Pod startup latency is too high. expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m @@ -105,7 +105,7 @@ spec: - alert: KubeletClientCertificateExpiration annotations: description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificateexpiration + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration summary: Kubelet client certificate is about to expire. expr: kubelet_certificate_manager_client_ttl_seconds < 604800 labels: @@ -116,7 +116,7 @@ spec: - alert: KubeletClientCertificateExpiration annotations: description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificateexpiration + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration summary: Kubelet client certificate is about to expire. 
expr: kubelet_certificate_manager_client_ttl_seconds < 86400 labels: @@ -127,7 +127,7 @@ spec: - alert: KubeletServerCertificateExpiration annotations: description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificateexpiration + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration summary: Kubelet server certificate is about to expire. expr: kubelet_certificate_manager_server_ttl_seconds < 604800 labels: @@ -138,7 +138,7 @@ spec: - alert: KubeletServerCertificateExpiration annotations: description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificateexpiration + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration summary: Kubelet server certificate is about to expire. expr: kubelet_certificate_manager_server_ttl_seconds < 86400 labels: @@ -149,7 +149,7 @@ spec: - alert: KubeletClientCertificateRenewalErrors annotations: description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes). - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificaterenewalerrors + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors summary: Kubelet has failed to renew its client certificate. 
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 for: 15m @@ -161,7 +161,7 @@ spec: - alert: KubeletServerCertificateRenewalErrors annotations: description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes). - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificaterenewalerrors + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors summary: Kubelet has failed to renew its server certificate. expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0 for: 15m @@ -174,7 +174,7 @@ spec: - alert: KubeletDown annotations: description: Kubelet has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1) for: 15m diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-scheduler.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-scheduler.yaml index 4ec8844f0..2e30ddb16 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-scheduler.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-scheduler.yaml @@ -27,7 +27,7 @@ spec: - alert: KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown summary: Target disappeared from Prometheus target discovery. 
expr: absent(up{job="kube-scheduler"} == 1) for: 15m diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system.yaml index 487ab17d5..79ce6036a 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system.yaml @@ -26,7 +26,7 @@ spec: - alert: KubeVersionMismatch annotations: description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch summary: Different semantic versions of Kubernetes components running. expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 for: 15m @@ -38,7 +38,7 @@ spec: - alert: KubeClientErrors annotations: description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors + runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclienterrors summary: Kubernetes API server client is experiencing errors. 
expr: |- (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/node-exporter.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/node-exporter.yaml index e4cab93fb..b4bc0b198 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/node-exporter.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/node-exporter.yaml @@ -26,7 +26,7 @@ spec: - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 24 hours. expr: |- ( @@ -45,7 +45,7 @@ spec: - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 4 hours. expr: |- ( @@ -64,7 +64,7 @@ spec: - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace summary: Filesystem has less than 5% space left. 
expr: |- ( @@ -81,7 +81,7 @@ spec: - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace summary: Filesystem has less than 3% space left. expr: |- ( @@ -98,7 +98,7 @@ spec: - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 24 hours. expr: |- ( @@ -117,7 +117,7 @@ spec: - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 4 hours. expr: |- ( @@ -136,7 +136,7 @@ spec: - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. 
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles summary: Filesystem has less than 5% inodes left. expr: |- ( @@ -153,7 +153,7 @@ spec: - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles summary: Filesystem has less than 3% inodes left. expr: |- ( @@ -170,7 +170,7 @@ spec: - alert: NodeNetworkReceiveErrs annotations: description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs summary: Network interface is reporting many receive errors. expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 for: 1h @@ -182,7 +182,7 @@ spec: - alert: NodeNetworkTransmitErrs annotations: description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs summary: Network interface is reporting many transmit errors. 
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 for: 1h @@ -194,7 +194,7 @@ spec: - alert: NodeHighNumberConntrackEntriesUsed annotations: description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodehighnumberconntrackentriesused summary: Number of conntrack are getting close to the limit. expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 labels: @@ -205,7 +205,7 @@ spec: - alert: NodeTextFileCollectorScrapeError annotations: description: Node Exporter text file collector failed to scrape. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodetextfilecollectorscrapeerror + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodetextfilecollectorscrapeerror summary: Node Exporter text file collector failed to scrape. expr: node_textfile_scrape_error{job="node-exporter"} == 1 labels: @@ -216,7 +216,7 @@ spec: - alert: NodeClockSkewDetected annotations: description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclockskewdetected summary: Clock skew detected. expr: |- ( @@ -239,7 +239,7 @@ spec: - alert: NodeClockNotSynchronising annotations: description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclocknotsynchronising summary: Clock not synchronising. 
expr: |- min_over_time(node_timex_sync_status[5m]) == 0 @@ -254,7 +254,7 @@ spec: - alert: NodeRAIDDegraded annotations: description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddegraded + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded summary: RAID Array is degraded expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 for: 15m @@ -266,12 +266,42 @@ spec: - alert: NodeRAIDDiskFailure annotations: description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddiskfailure + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddiskfailure summary: Failed device in RAID array expr: node_md_disks{state="failed"} > 0 labels: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%. + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. 
+ expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%. + runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} {{- end }} \ No newline at end of file diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/node-network.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/node-network.yaml index a6c907fe8..540737dfa 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/node-network.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/node-network.yaml @@ -25,8 +25,9 @@ spec: rules: - alert: NodeNetworkInterfaceFlapping annotations: - message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkinterfaceflapping + description: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} + runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping + summary: Network interface is often changing its status expr: 
changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m labels: diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/service-health.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/vm-health.yaml similarity index 88% rename from charts/victoria-metrics-k8s-stack/templates/rules/service-health.yaml rename to charts/victoria-metrics-k8s-stack/templates/rules/vm-health.yaml index a789491a1..d0f5823c6 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/service-health.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/vm-health.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'serviceHealth' group from https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/master/deployment/docker/alerts.yml +Generated from 'vm-health' group from https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/master/deployment/docker/alerts.yml Do not change in-place! In order to change this file first read following link: https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack */ -}} @@ -8,7 +8,7 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: namespace: {{ .Release.Namespace }} - name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) "service-health" | trunc 63 | trimSuffix "-" }} + name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) 
"vm-health" | trunc 63 | trimSuffix "-" }} labels: app: {{ include "victoria-metrics-k8s-stack.name" $ }} {{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }} @@ -21,7 +21,7 @@ metadata: {{- end }} spec: groups: - - name: serviceHealth + - name: vm-health rules: - alert: TooManyRestarts annotations: diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/vmsingle.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/vmsingle.yaml index dcdfe8a47..7750f1cef 100644 --- a/charts/victoria-metrics-k8s-stack/templates/rules/vmsingle.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/rules/vmsingle.yaml @@ -31,16 +31,16 @@ spec: description: "Taking into account current ingestion rate, free disk space will be enough only for {{`{{`}} $value | humanizeDuration {{`}}`}} on instance {{`{{`}} $labels.instance {{`}}`}}.\n Consider to limit the ingestion rate, decrease retention or scale the disk space if possible." summary: Instance {{`{{`}} $labels.instance {{`}}`}} will run out of disk space soon expr: |- - vm_free_disk_space_bytes / ignoring(path) ( - ( - sum(rate(vm_rows_added_to_storage_total[1d])) - - sum(rate(vm_deduplicated_samples_total[1d])) without(type) - ) - * - ( - sum(vm_data_size_bytes{type!="indexdb"}) / - sum(vm_rows{type!="indexdb"}) - ) + vm_free_disk_space_bytes / ignoring(path) + ( + ( + rate(vm_rows_added_to_storage_total[1d]) - + ignoring(type) rate(vm_deduplicated_samples_total{type="merge"}[1d]) + ) + * scalar( + sum(vm_data_size_bytes{type!="indexdb"}) / + sum(vm_rows{type!="indexdb"}) + ) ) < 3 * 24 * 3600 for: 30m labels: @@ -173,5 +173,17 @@ spec: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: LabelsLimitExceededOnIngestion + annotations: + dashboard: {{ index .Values.grafana.ingress.hosts 0 }}/d/oS7Bi_0Wz?viewPanel=74&var-instance={{`{{`}} $labels.instance {{`}}`}} + description: "VictoriaMetrics limits the 
number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n This prevents from ingesting metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured correctly or that clients which send these metrics aren't misbehaving." + summary: Metrics ingested in ({{`{{`}} $labels.instance {{`}}`}}) are exceeding labels limit + expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0 + for: 15m + labels: + severity: warning +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} {{- end }} \ No newline at end of file diff --git a/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/uninstall_hook.yaml b/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/uninstall_hook.yaml index 63b5228c9..03ef5d537 100644 --- a/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/uninstall_hook.yaml +++ b/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/uninstall_hook.yaml @@ -1,4 +1,5 @@ -{{- if and .Values.operator.cleanupCRD }} +{{- if .Values.operator.enabled }} +{{- if .Values.operator.cleanupCRD }} apiVersion: batch/v1 kind: Job metadata: @@ -28,6 +29,7 @@ spec: kubectl delete vmagents --all --ignore-not-found=true; kubectl delete vmsingles --all --ignore-not-found=true; kubectl delete vmalertmanagers --all --ignore-not-found=true; + kubectl delete vmclusters --all --ignore-not-found=true; restartPolicy: OnFailure --- {{- if .Values.operator.cleanupSA.create }} @@ -65,8 +67,10 @@ rules: - vmalerts - vmsingles - vmalertmanagers + - vmclusters verbs: ["get", "list", "watch","delete"] --- {{- end }} --- +{{- end }} {{- end }} \ No newline at end of file diff --git a/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/vmcluster.yaml b/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/vmcluster.yaml new file mode 100644 index 
000000000..19fb46018 --- /dev/null +++ b/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/vmcluster.yaml @@ -0,0 +1,263 @@ +{{- if .Values.vmcluster.enabled }} +--- +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMCluster +metadata: + name: {{ .Values.vmcluster.name | default (include "victoria-metrics-k8s-stack.fullname" .) }} + namespace: {{ .Release.Namespace }} + labels: {{ include "victoria-metrics-k8s-stack.labels" . | nindent 4 }} +spec: +{{ .Values.vmcluster.spec | toYaml | indent 2 }} + +{{- $newAPI := .Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" -}} +{{ if .Values.vmcluster.ingress.storage.enabled }} +--- +{{- with .Values.vmcluster.ingress.storage }} +{{- $servicePort := $.Values.vmcluster.spec.vmstorage.port | default 8482 -}} +{{- $serviceName := printf "%s-%s" "vmstorage" (include "victoria-metrics-k8s-stack.fullname" $) | trunc 63 | trimSuffix "-" }} +{{- $ingressPath := .path -}} +{{- $ingressPathType := .pathType | default "" -}} +{{- $extraPaths := .extraPaths -}} +{{- if $newAPI -}} +apiVersion: networking.k8s.io/v1 +{{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }} +apiVersion: networking.k8s.io/v1beta1 +{{- else }} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $serviceName }} + namespace: {{ $.Release.Namespace }} + labels: + app.kubernetes.io/component: {{ include "victoria-metrics-k8s-stack.name" $ }}-vmcluster +{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }} +{{- if .labels }} +{{ toYaml .labels | indent 4 }} +{{- end }} + {{- if .annotations }} + annotations: + {{- range $key, $value := .annotations }} + {{ $key }}: {{ tpl $value $ | quote }} + {{- end }} + {{- end }} +spec: + {{- if .ingressClassName }} + ingressClassName: {{ .ingressClassName }} + {{- end -}} +{{- if .tls }} + tls: +{{ tpl (toYaml .tls) $ | indent 4 }} +{{- end }} + rules: + {{- if .hosts }} + {{- range .hosts }} + - host: {{ tpl . 
$}} + http: + paths: +{{- if $extraPaths }} +{{ toYaml $extraPaths | indent 10 }} +{{- end }} + - path: {{ $ingressPath }} + {{- if $newAPI }} + pathType: {{ $ingressPathType }} + {{- end }} + backend: + {{- if $newAPI }} + service: + name: {{ $serviceName }} + port: + number: {{ $servicePort }} + {{- else }} + serviceName: {{ $serviceName }} + servicePort: {{ $servicePort }} + {{- end }} + {{- end }} + {{- else }} + - http: + paths: + - backend: + {{- if $newAPI }} + service: + name: {{ $serviceName }} + port: + number: {{ $servicePort }} + pathType: {{ $ingressPathType }} + {{- else }} + serviceName: {{ $serviceName }} + servicePort: {{ $servicePort }} + {{- end }} + {{- if $ingressPath }} + path: {{ $ingressPath }} + {{- end }} + {{- end -}} +{{- end }} +{{- end }} +{{ if .Values.vmcluster.ingress.select.enabled -}} +--- +{{- with .Values.vmcluster.ingress.select }} +{{- $servicePort := $.Values.vmcluster.spec.vmselect.port | default 8481 -}} +{{- $serviceName := printf "%s-%s" "vmselect" (include "victoria-metrics-k8s-stack.fullname" $) | trunc 63 | trimSuffix "-" }} +{{- $ingressPath := .path -}} +{{- $ingressPathType := .pathType | default "" -}} +{{- $extraPaths := .extraPaths -}} +{{- if $newAPI -}} +apiVersion: networking.k8s.io/v1 +{{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }} +apiVersion: networking.k8s.io/v1beta1 +{{- else }} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $serviceName }} + namespace: {{ $.Release.Namespace }} + labels: + app.kubernetes.io/component: {{ include "victoria-metrics-k8s-stack.name" $ }}-vmcluster +{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }} +{{- if .labels }} +{{ toYaml .labels | indent 4 }} +{{- end }} + {{- if .annotations }} + annotations: + {{- range $key, $value := .annotations }} + {{ $key }}: {{ tpl $value $ | quote }} + {{- end }} + {{- end }} +spec: + {{- if .ingressClassName }} + ingressClassName: {{ .ingressClassName }} + {{- 
end -}} +{{- if .tls }} + tls: +{{ tpl (toYaml .tls) $ | indent 4 }} +{{- end }} + rules: + {{- if .hosts }} + {{- range .hosts }} + - host: {{ tpl . $}} + http: + paths: +{{- if $extraPaths }} +{{ toYaml $extraPaths | indent 10 }} +{{- end }} + - path: {{ $ingressPath }} + {{- if $newAPI }} + pathType: {{ $ingressPathType }} + {{- end }} + backend: + {{- if $newAPI }} + service: + name: {{ $serviceName }} + port: + number: {{ $servicePort }} + {{- else }} + serviceName: {{ $serviceName }} + servicePort: {{ $servicePort }} + {{- end }} + {{- end }} + {{- else }} + - http: + paths: + - backend: + {{- if $newAPI }} + service: + name: {{ $serviceName }} + port: + number: {{ $servicePort }} + pathType: {{ $ingressPathType }} + {{- else }} + serviceName: {{ $serviceName }} + servicePort: {{ $servicePort }} + {{- end }} + {{- if $ingressPath }} + path: {{ $ingressPath }} + {{- end }} + {{- end -}} +{{- end }} +{{- end }} +{{ if .Values.vmcluster.ingress.insert.enabled -}} +--- +{{- with .Values.vmcluster.ingress.insert }} +{{- $servicePort := $.Values.vmcluster.spec.vminsert.port | default 8480 -}} +{{- $serviceName := printf "%s-%s" "vminsert" (include "victoria-metrics-k8s-stack.fullname" $) | trunc 63 | trimSuffix "-" }} +{{- $ingressPath := .path -}} +{{- $ingressPathType := .pathType | default "" -}} +{{- $extraPaths := .extraPaths -}} +{{- if $newAPI -}} +apiVersion: networking.k8s.io/v1 +{{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }} +apiVersion: networking.k8s.io/v1beta1 +{{- else }} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $serviceName }} + namespace: {{ $.Release.Namespace }} + labels: + app.kubernetes.io/component: {{ include "victoria-metrics-k8s-stack.name" $ }}-vmcluster +{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }} +{{- if .labels }} +{{ toYaml .labels | indent 4 }} +{{- end }} + {{- if .annotations }} + annotations: + {{- range $key, $value := .annotations }} + {{ 
$key }}: {{ tpl $value $ | quote }} + {{- end }} + {{- end }} +spec: + {{- if .ingressClassName }} + ingressClassName: {{ .ingressClassName }} + {{- end -}} +{{- if .tls }} + tls: +{{ tpl (toYaml .tls) $ | indent 4 }} +{{- end }} + rules: + {{- if .hosts }} + {{- range .hosts }} + - host: {{ tpl . $}} + http: + paths: +{{- if $extraPaths }} +{{ toYaml $extraPaths | indent 10 }} +{{- end }} + - path: {{ $ingressPath }} + {{- if $newAPI }} + pathType: {{ $ingressPathType }} + {{- end }} + backend: + {{- if $newAPI }} + service: + name: {{ $serviceName }} + port: + number: {{ $servicePort }} + {{- else }} + serviceName: {{ $serviceName }} + servicePort: {{ $servicePort }} + {{- end }} + {{- end }} + {{- else }} + - http: + paths: + - backend: + {{- if $newAPI }} + service: + name: {{ $serviceName }} + port: + number: {{ $servicePort }} + pathType: {{ $ingressPathType }} + {{- else }} + serviceName: {{ $serviceName }} + servicePort: {{ $servicePort }} + {{- end }} + {{- if $ingressPath }} + path: {{ $ingressPath }} + {{- end }} + {{- end -}} +{{- end }} +{{- end }} +{{- end }} + diff --git a/charts/victoria-metrics-k8s-stack/values.yaml b/charts/victoria-metrics-k8s-stack/values.yaml index dda014290..c26f83cc5 100644 --- a/charts/victoria-metrics-k8s-stack/values.yaml +++ b/charts/victoria-metrics-k8s-stack/values.yaml @@ -2,6 +2,7 @@ nameOverride: "" fullnameOverride: "" operator: + enabled: true cleanupCRD: true cleanupSA: create: true @@ -11,7 +12,6 @@ operator: tag: v1.16.0 pullPolicy: IfNotPresent - serviceAccount: # Specifies whether a service account should be created create: true @@ -45,7 +45,7 @@ defaultRules: node: true ## Runbook url prefix for default rules - runbookUrl: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md# + runbookUrl: https://runbooks.prometheus-operator.dev/runbooks ## Reduce app namespace alert scope appNamespacesTarget: ".*" @@ -57,8 +57,6 @@ defaultRules: ## Additional labels for PrometheusRule alerts 
additionalRuleLabels: {} - - ############## # victoria-metrics-operator dependency chart configuration. For possible values refer to https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-operator#parameters @@ -69,12 +67,13 @@ victoria-metrics-operator: # -- By default, operator converts prometheus-operator objects. disable_prometheus_converter: true - vmsingle: enabled: true # spec for VMSingle crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmsinglespec spec: + image: + tag: v1.63.0 retentionPeriod: "14" replicaCount: 1 storage: @@ -89,7 +88,8 @@ vmsingle: # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress # ingressClassName: nginx # Values can be templated - annotations: {} + annotations: + {} # kubernetes.io/ingress.class: nginx # kubernetes.io/tls-acme: "true" labels: {} @@ -118,11 +118,172 @@ vmsingle: # hosts: # - vmsingle.domain.com +vmcluster: + enabled: false + # spec for VMCluster crd + # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmclusterspec + spec: + retentionPeriod: "14" + replicationFactor: 2 + vmstorage: + image: + tag: v1.63.0-cluster + replicaCount: 2 + storageDataPath: "/vm-data" + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 10Gi + resources: + limits: + cpu: "1" + memory: 1500Mi + vmselect: + image: + tag: v1.63.0-cluster + replicaCount: 2 + cacheMountPath: "/select-cache" + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 2Gi + resources: + limits: + cpu: "1" + memory: "1000Mi" + requests: + cpu: "0.5" + memory: "500Mi" + vminsert: + image: + tag: v1.63.0-cluster + replicaCount: 2 + resources: + limits: + cpu: "1" + memory: 1000Mi + requests: + cpu: "0.5" + memory: "500Mi" + + ingress: + storage: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See
https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmstorage.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmstorage-ingress-tls + # hosts: + # - vmstorage.domain.com + select: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmselect.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmselect-ingress-tls + # hosts: + # - vmselect.domain.com + insert: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vminsert.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vminsert-ingress-tls + # hosts: + # - vminsert.domain.com + alertmanager: enabled: true # spec for VMAlertmanager crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmalertmanagerspec spec: + image: + tag: v0.22.2 externalURL: "" routePrefix: / @@ -134,81 +295,81 @@ alertmanager: resolve_timeout: 5m slack_api_url: "http://slack:30500/" templates: - - "/etc/vm/configs/**/*.tmpl" + - "/etc/vm/configs/**/*.tmpl" route: - group_by: ['job'] + group_by: ["job"] group_wait: 30s group_interval: 5m repeat_interval: 12h - receiver: 'slack-monitoring' + receiver: "slack-monitoring" routes: - ################################################### - ## Duplicate code_owner routes to teams - ## These will send alerts to team channels but 
continue - ## processing through the rest of the tree to handled by on-call - - match_re: - code_owner: '.+' - routes: - - match: {severity: info|warning|critical} - continue: true - receiver: slack-code-owners - - ################################################### - ## Standard on-call routes - - match_re: - severity: info|warning|critical - receiver: slack-monitoring - continue: true - + ################################################### + ## Duplicate code_owner routes to teams + ## These will send alerts to team channels but continue + ## processing through the rest of the tree to be handled by on-call + - match_re: + code_owner: ".+" + routes: + - match: { severity: info|warning|critical } + continue: true + receiver: slack-code-owners + + ################################################### + ## Standard on-call routes + - match_re: + severity: info|warning|critical + receiver: slack-monitoring + continue: true + receivers: - name: "slack-monitoring" slack_configs: - - channel: "#channel" - send_resolved: true - title: '{{ template "slack.monzo.title" . }}' - icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' - color: '{{ template "slack.monzo.color" . }}' - text: '{{ template "slack.monzo.text" . }}' - actions: - - type: button - text: 'Runbook :green_book:' - url: '{{ (index .Alerts 0).Annotations.runbook }}' - - type: button - text: 'Query :mag:' - url: '{{ (index .Alerts 0).GeneratorURL }}' - - type: button - text: 'Dashboard :grafana:' - url: '{{ (index .Alerts 0).Annotations.dashboard }}' - - type: button - text: 'Silence :no_bell:' - url: '{{ template "__alert_silence_link" . }}' - - type: button - text: '{{ template "slack.monzo.link_button_text" . }}' - url: '{{ .CommonAnnotations.link_url }}' + - channel: "#channel" + send_resolved: true + title: '{{ template "slack.monzo.title" . }}' + icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' + color: '{{ template "slack.monzo.color" . }}' + text: '{{ template "slack.monzo.text" .
}}' + actions: + - type: button + text: "Runbook :green_book:" + url: "{{ (index .Alerts 0).Annotations.runbook }}" + - type: button + text: "Query :mag:" + url: "{{ (index .Alerts 0).GeneratorURL }}" + - type: button + text: "Dashboard :grafana:" + url: "{{ (index .Alerts 0).Annotations.dashboard }}" + - type: button + text: "Silence :no_bell:" + url: '{{ template "__alert_silence_link" . }}' + - type: button + text: '{{ template "slack.monzo.link_button_text" . }}' + url: "{{ .CommonAnnotations.link_url }}" - name: slack-code-owners slack_configs: - - channel: '#{{- template "slack.monzo.code_owner_channel" . -}}' - send_resolved: true - title: '{{ template "slack.monzo.title" . }}' - icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' - color: '{{ template "slack.monzo.color" . }}' - text: '{{ template "slack.monzo.text" . }}' - actions: - - type: button - text: 'Runbook :green_book:' - url: '{{ (index .Alerts 0).Annotations.runbook }}' - - type: button - text: 'Query :mag:' - url: '{{ (index .Alerts 0).GeneratorURL }}' - - type: button - text: 'Dashboard :grafana:' - url: '{{ (index .Alerts 0).Annotations.dashboard }}' - - type: button - text: 'Silence :no_bell:' - url: '{{ template "__alert_silence_link" . }}' - - type: button - text: '{{ template "slack.monzo.link_button_text" . }}' - url: '{{ .CommonAnnotations.link_url }}' + - channel: '#{{- template "slack.monzo.code_owner_channel" . -}}' + send_resolved: true + title: '{{ template "slack.monzo.title" . }}' + icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' + color: '{{ template "slack.monzo.color" . }}' + text: '{{ template "slack.monzo.text" . 
}}' + actions: + - type: button + text: "Runbook :green_book:" + url: "{{ (index .Alerts 0).Annotations.runbook }}" + - type: button + text: "Query :mag:" + url: "{{ (index .Alerts 0).GeneratorURL }}" + - type: button + text: "Dashboard :grafana:" + url: "{{ (index .Alerts 0).Annotations.dashboard }}" + - type: button + text: "Silence :no_bell:" + url: '{{ template "__alert_silence_link" . }}' + - type: button + text: '{{ template "slack.monzo.link_button_text" . }}' + url: "{{ .CommonAnnotations.link_url }}" # better alert templates for slack # source https://gist.github.com/milesbxf/e2744fc90e9c41b47aa47925f8ff6512 @@ -221,7 +382,8 @@ alertmanager: # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress # ingressClassName: nginx # Values can be templated - annotations: {} + annotations: + {} # kubernetes.io/ingress.class: nginx # kubernetes.io/tls-acme: "true" labels: {} @@ -250,13 +412,13 @@ alertmanager: # hosts: # - alertmanager.domain.com - - vmalert: enabled: true # spec for VMAlert crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmalertspec spec: + image: + tag: v1.63.0 evaluationInterval: 15s ingress: enabled: false @@ -264,7 +426,8 @@ vmalert: # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress # ingressClassName: nginx # Values can be templated - annotations: {} + annotations: + {} # kubernetes.io/ingress.class: nginx # kubernetes.io/tls-acme: "true" labels: {} @@ -293,12 +456,13 @@ vmalert: # hosts: # - vmalert.domain.com - vmagent: enabled: true # spec for VMAgent crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmagentspec spec: + image: + tag: v1.63.0 scrapeInterval: 25s externalLabels: cluster: cluster-name @@ -310,7 +474,8 @@ vmagent: # See 
https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress # ingressClassName: nginx # Values can be templated - annotations: {} + annotations: + {} # kubernetes.io/ingress.class: nginx # kubernetes.io/tls-acme: "true" labels: {} @@ -339,7 +504,6 @@ vmagent: # hosts: # - vmagent.domain.com - ################################################# ### dependencies ##### ################################################# @@ -374,14 +538,14 @@ grafana: dashboardproviders.yaml: apiVersion: 1 providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - editable: true - options: - path: /var/lib/grafana/dashboards/default + - name: "default" + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default dashboards: default: @@ -402,7 +566,8 @@ grafana: # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress # ingressClassName: nginx # Values can be templated - annotations: {} + annotations: + {} # kubernetes.io/ingress.class: nginx # kubernetes.io/tls-acme: "true" labels: {} @@ -435,12 +600,10 @@ grafana: # wheter we should create a service scrape resource for node-exporter enabled: true - # spec for VMServiceScrape crd + # spec for VMServiceScrape crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec spec: {} - - # prometheus-node-exporter dependency chart configuration. 
For possible values refer to https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml prometheus-node-exporter: enabled: true @@ -458,27 +621,23 @@ prometheus-node-exporter: # wheter we should create a service scrape resource for node-exporter enabled: true - # spec for VMServiceScrape crd + # spec for VMServiceScrape crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec spec: jobLabel: jobLabel - - # kube-state-metrics dependency chart configuration. For possible values refer to https://github.com/kubernetes/kube-state-metrics/blob/master/charts/kube-state-metrics/values.yaml kube-state-metrics: enabled: true ## all values for kube-state-metrics helm chart can be specified here - # spec for VMServiceScrape crd + # spec for VMServiceScrape crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec vmServiceScrape: spec: {} #TODO: selector override for kube-state-metrics deployed separatelly - - ### Service Monitors ## Component scraping the kubelets kubelet: @@ -488,7 +647,7 @@ kubelet: cadvisor: true ## Enable scraping /metrics/probes from kubelet's service probes: true - # spec for VMNodeScrape crd + # spec for VMNodeScrape crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmnodescrapespec spec: scheme: "https" @@ -507,34 +666,30 @@ kubelet: - targetLabel: "job" replacement: "kubelet" - - ## Component scraping the kube api server kubeApiServer: enabled: true - # spec for VMServiceScrape crd + # spec for VMServiceScrape crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec spec: endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - # bearerTokenSecret: - # key: "" - port: https - scheme: https - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - serverName: kubernetes + - bearerTokenFile: 
/var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes jobLabel: component namespaceSelector: matchNames: - - default + - default selector: matchLabels: component: apiserver provider: kubernetes - - ## Component scraping the kube controller manager kubeControllerManager: enabled: true @@ -555,10 +710,9 @@ kubeControllerManager: # selector: # component: kube-controller-manager - vmServiceScrape: enabled: true - # spec for VMServiceScrape crd + # spec for VMServiceScrape crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec spec: jobLabel: jobLabel @@ -572,8 +726,6 @@ kubeControllerManager: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt serverName: kubernetes - - ## Component scraping coreDns. Use either this or kubeDns ## coreDns: @@ -587,14 +739,12 @@ coreDns: vmServiceScrape: enabled: true - # spec for VMServiceScrape crd + # spec for VMServiceScrape crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec - spec: + spec: endpoints: - - port: http-metrics - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - - + - port: http-metrics + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token ## Component scraping etcd ## @@ -619,7 +769,7 @@ kubeEtcd: vmServiceScrape: enabled: true - # spec for VMServiceScrape crd + # spec for VMServiceScrape crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec spec: jobLabel: jobLabel @@ -632,8 +782,6 @@ kubeEtcd: tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - ## Component scraping kube scheduler ## kubeScheduler: @@ -657,7 +805,7 @@ kubeScheduler: vmServiceScrape: enabled: true - # spec for VMServiceScrape crd + # spec for VMServiceScrape crd # 
https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec spec: jobLabel: jobLabel @@ -670,8 +818,6 @@ kubeScheduler: tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - - ## Component scraping kube proxy ## kubeProxy: @@ -693,7 +839,7 @@ kubeProxy: vmServiceScrape: enabled: true - # spec for VMServiceScrape crd + # spec for VMServiceScrape crd # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec spec: jobLabel: jobLabel