diff --git a/charts/victoria-metrics-k8s-stack/.gitignore b/charts/victoria-metrics-k8s-stack/.gitignore
new file mode 100644
index 000000000..89f9ac04a
--- /dev/null
+++ b/charts/victoria-metrics-k8s-stack/.gitignore
@@ -0,0 +1 @@
+out/
diff --git a/charts/victoria-metrics-k8s-stack/Chart.lock b/charts/victoria-metrics-k8s-stack/Chart.lock
index 60a5ad10b..0484a6be2 100644
--- a/charts/victoria-metrics-k8s-stack/Chart.lock
+++ b/charts/victoria-metrics-k8s-stack/Chart.lock
@@ -4,12 +4,12 @@ dependencies:
version: 0.1.17
- name: kube-state-metrics
repository: https://prometheus-community.github.io/helm-charts
- version: 3.2.2
+ version: 3.4.1
- name: prometheus-node-exporter
repository: https://prometheus-community.github.io/helm-charts
- version: 1.18.2
+ version: 2.0.2
- name: grafana
repository: https://grafana.github.io/helm-charts
- version: 6.12.1
-digest: sha256:58ad75ef412eed7eff3fbfc4d721c33d34ca1af838f60d7297de2388dc2d8b26
-generated: "2021-07-12T20:06:41.848769241Z"
+ version: 6.14.1
+digest: sha256:ef56bd6d0c02f87ffbf5f3ae2debf4a8d6a914c1cd46a999940ce1d62354e039
+generated: "2021-07-27T19:55:13.172435+08:00"
diff --git a/charts/victoria-metrics-k8s-stack/Chart.yaml b/charts/victoria-metrics-k8s-stack/Chart.yaml
index 64e527900..434df53fe 100644
--- a/charts/victoria-metrics-k8s-stack/Chart.yaml
+++ b/charts/victoria-metrics-k8s-stack/Chart.yaml
@@ -2,22 +2,23 @@ apiVersion: v2
name: victoria-metrics-k8s-stack
description: Kubernetes monitoring on VictoriaMetrics stack. Includes VictoriaMetrics Operator, Grafana dashboards, ServiceScrapes and VMRules
type: application
-version: 0.2.9
-appVersion: "1.16.0"
+version: 0.3.0
+appVersion: "1.63.0"
dependencies:
-- name: victoria-metrics-operator
- version: "0.1.*"
- repository: https://victoriametrics.github.io/helm-charts
-- name: kube-state-metrics
- version: "3.2.*"
- repository: https://prometheus-community.github.io/helm-charts
- condition: kube-state-metrics.enabled
-- name: prometheus-node-exporter
- version: "1.18.*"
- repository: https://prometheus-community.github.io/helm-charts
- condition: prometheus-node-exporter.enabled
-- name: grafana
- version: "6.12.*"
- repository: https://grafana.github.io/helm-charts
- condition: grafana.enabled
\ No newline at end of file
+ - name: victoria-metrics-operator
+ version: "0.1.*"
+ repository: https://victoriametrics.github.io/helm-charts
+ condition: operator.enabled
+ - name: kube-state-metrics
+ version: "3.4.*"
+ repository: https://prometheus-community.github.io/helm-charts
+ condition: kube-state-metrics.enabled
+ - name: prometheus-node-exporter
+ version: "2.0.*"
+ repository: https://prometheus-community.github.io/helm-charts
+ condition: prometheus-node-exporter.enabled
+ - name: grafana
+ version: "6.14.*"
+ repository: https://grafana.github.io/helm-charts
+ condition: grafana.enabled
diff --git a/charts/victoria-metrics-k8s-stack/README.md b/charts/victoria-metrics-k8s-stack/README.md
index e8713eb02..1f8c97768 100644
--- a/charts/victoria-metrics-k8s-stack/README.md
+++ b/charts/victoria-metrics-k8s-stack/README.md
@@ -1,6 +1,6 @@
# Helm Chart For Victoria Metrics kubernetes monitoring stack.
-![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![Version: 0.2.9](https://img.shields.io/badge/Version-0.2.9-informational?style=flat-square)
+![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![Version: 0.3.0](https://img.shields.io/badge/Version-0.3.0-informational?style=flat-square)
Kubernetes monitoring on VictoriaMetrics stack. Includes VictoriaMetrics Operator, Grafana dashboards, ServiceScrapes and VMRules
@@ -252,6 +252,7 @@ Change the values according to the need of the environment in ``victoria-metrics
| alertmanager.ingress.tls | list | `[]` | |
| alertmanager.monzoTemplate.enabled | bool | `true` | |
| alertmanager.spec.externalURL | string | `""` | |
+| alertmanager.spec.image.tag | string | `"v0.22.2"` | |
| alertmanager.spec.routePrefix | string | `"/"` | |
| coreDns.enabled | bool | `true` | |
| coreDns.service.enabled | bool | `true` | |
@@ -282,7 +283,7 @@ Change the values according to the need of the environment in ``victoria-metrics
| defaultRules.rules.kubernetesSystem | bool | `true` | |
| defaultRules.rules.network | bool | `true` | |
| defaultRules.rules.node | bool | `true` | |
-| defaultRules.runbookUrl | string | `"https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#"` | |
+| defaultRules.runbookUrl | string | `"https://runbooks.prometheus-operator.dev/runbooks"` | |
| fullnameOverride | string | `""` | |
| grafana.additionalDataSources | list | `[]` | |
| grafana.dashboardProviders."dashboardproviders.yaml".apiVersion | int | `1` | |
@@ -390,6 +391,7 @@ Change the values according to the need of the environment in ``victoria-metrics
| operator.cleanupCRD | bool | `true` | |
| operator.cleanupSA.create | bool | `true` | |
| operator.cleanupSA.name | string | `""` | |
+| operator.enabled | bool | `true` | |
| operator.kubectlImage.pullPolicy | string | `"IfNotPresent"` | |
| operator.kubectlImage.repository | string | `"gcr.io/google_containers/hyperkube"` | |
| operator.kubectlImage.tag | string | `"v1.16.0"` | |
@@ -415,6 +417,7 @@ Change the values according to the need of the environment in ``victoria-metrics
| vmagent.ingress.tls | list | `[]` | |
| vmagent.spec.externalLabels.cluster | string | `"cluster-name"` | |
| vmagent.spec.extraArgs."promscrape.streamParse" | string | `"true"` | |
+| vmagent.spec.image.tag | string | `"v1.63.0"` | |
| vmagent.spec.scrapeInterval | string | `"25s"` | |
| vmalert.enabled | bool | `true` | |
| vmalert.ingress.annotations | object | `{}` | |
@@ -426,6 +429,54 @@ Change the values according to the need of the environment in ``victoria-metrics
| vmalert.ingress.pathType | string | `"Prefix"` | |
| vmalert.ingress.tls | list | `[]` | |
| vmalert.spec.evaluationInterval | string | `"15s"` | |
+| vmalert.spec.image.tag | string | `"v1.63.0"` | |
+| vmcluster.enabled | bool | `false` | |
+| vmcluster.ingress.insert.annotations | object | `{}` | |
+| vmcluster.ingress.insert.enabled | bool | `false` | |
+| vmcluster.ingress.insert.extraPaths | list | `[]` | |
+| vmcluster.ingress.insert.hosts[0] | string | `"vminsert.domain.com"` | |
+| vmcluster.ingress.insert.labels | object | `{}` | |
+| vmcluster.ingress.insert.path | string | `"/"` | |
+| vmcluster.ingress.insert.pathType | string | `"Prefix"` | |
+| vmcluster.ingress.insert.tls | list | `[]` | |
+| vmcluster.ingress.select.annotations | object | `{}` | |
+| vmcluster.ingress.select.enabled | bool | `false` | |
+| vmcluster.ingress.select.extraPaths | list | `[]` | |
+| vmcluster.ingress.select.hosts[0] | string | `"vmselect.domain.com"` | |
+| vmcluster.ingress.select.labels | object | `{}` | |
+| vmcluster.ingress.select.path | string | `"/"` | |
+| vmcluster.ingress.select.pathType | string | `"Prefix"` | |
+| vmcluster.ingress.select.tls | list | `[]` | |
+| vmcluster.ingress.storage.annotations | object | `{}` | |
+| vmcluster.ingress.storage.enabled | bool | `false` | |
+| vmcluster.ingress.storage.extraPaths | list | `[]` | |
+| vmcluster.ingress.storage.hosts[0] | string | `"vmstorage.domain.com"` | |
+| vmcluster.ingress.storage.labels | object | `{}` | |
+| vmcluster.ingress.storage.path | string | `"/"` | |
+| vmcluster.ingress.storage.pathType | string | `"Prefix"` | |
+| vmcluster.ingress.storage.tls | list | `[]` | |
+| vmcluster.spec.replicationFactor | int | `2` | |
+| vmcluster.spec.retentionPeriod | string | `"14"` | |
+| vmcluster.spec.vminsert.image.tag | string | `"v1.63.0-cluster"` | |
+| vmcluster.spec.vminsert.replicaCount | int | `2` | |
+| vmcluster.spec.vminsert.resources.limits.cpu | string | `"1"` | |
+| vmcluster.spec.vminsert.resources.limits.memory | string | `"1000Mi"` | |
+| vmcluster.spec.vminsert.resources.requests.cpu | string | `"0.5"` | |
+| vmcluster.spec.vminsert.resources.requests.memory | string | `"500Mi"` | |
+| vmcluster.spec.vmselect.cacheMountPath | string | `"/select-cache"` | |
+| vmcluster.spec.vmselect.image.tag | string | `"v1.63.0-cluster"` | |
+| vmcluster.spec.vmselect.replicaCount | int | `2` | |
+| vmcluster.spec.vmselect.resources.limits.cpu | string | `"1"` | |
+| vmcluster.spec.vmselect.resources.limits.memory | string | `"1000Mi"` | |
+| vmcluster.spec.vmselect.resources.requests.cpu | string | `"0.5"` | |
+| vmcluster.spec.vmselect.resources.requests.memory | string | `"500Mi"` | |
+| vmcluster.spec.vmselect.storage.volumeClaimTemplate.spec.resources.requests.storage | string | `"2Gi"` | |
+| vmcluster.spec.vmstorage.image.tag | string | `"v1.63.0-cluster"` | |
+| vmcluster.spec.vmstorage.replicaCount | int | `2` | |
+| vmcluster.spec.vmstorage.resources.limits.cpu | string | `"1"` | |
+| vmcluster.spec.vmstorage.resources.limits.memory | string | `"1500Mi"` | |
+| vmcluster.spec.vmstorage.storage.volumeClaimTemplate.spec.resources.requests.storage | string | `"10Gi"` | |
+| vmcluster.spec.vmstorage.storageDataPath | string | `"/vm-data"` | |
| vmsingle.enabled | bool | `true` | |
| vmsingle.ingress.annotations | object | `{}` | |
| vmsingle.ingress.enabled | bool | `false` | |
@@ -435,6 +486,7 @@ Change the values according to the need of the environment in ``victoria-metrics
| vmsingle.ingress.path | string | `"/"` | |
| vmsingle.ingress.pathType | string | `"Prefix"` | |
| vmsingle.ingress.tls | list | `[]` | |
+| vmsingle.spec.image.tag | string | `"v1.63.0"` | |
| vmsingle.spec.replicaCount | int | `1` | |
| vmsingle.spec.retentionPeriod | string | `"14"` | |
| vmsingle.spec.storage.accessModes[0] | string | `"ReadWriteOnce"` | |
diff --git a/charts/victoria-metrics-k8s-stack/hack/sync_grafana_dashboards.py b/charts/victoria-metrics-k8s-stack/hack/sync_grafana_dashboards.py
index 8eed4ab36..4f081b642 100644
--- a/charts/victoria-metrics-k8s-stack/hack/sync_grafana_dashboards.py
+++ b/charts/victoria-metrics-k8s-stack/hack/sync_grafana_dashboards.py
@@ -45,6 +45,11 @@ def new_representer(dumper, data):
'destination': '../templates/grafana/dashboards',
'type': 'json'
},
+ {
+ 'source': 'https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/cluster/dashboards/clusterbytenant.json',
+ 'destination': '../templates/grafana/dashboards',
+ 'type': 'json'
+ },
]
skip_list = [
@@ -63,6 +68,7 @@ def new_representer(dumper, data):
'scheduler': ' .Values.kubeScheduler.enabled',
'node-rsrc-use': ' (index .Values "prometheus-node-exporter" "enabled")',
'node-cluster-rsrc-use': ' (index .Values "prometheus-node-exporter" "enabled")',
+ 'clusterbytenant': '.Values.vmcluster.enabled'
}
# standard header
diff --git a/charts/victoria-metrics-k8s-stack/hack/sync_rules.py b/charts/victoria-metrics-k8s-stack/hack/sync_rules.py
index 6215271f5..9f6b64686 100644
--- a/charts/victoria-metrics-k8s-stack/hack/sync_rules.py
+++ b/charts/victoria-metrics-k8s-stack/hack/sync_rules.py
@@ -95,12 +95,9 @@ def new_representer(dumper, data):
}
replacement_map = {
- 'https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#': {
+ 'https://runbooks.prometheus-operator.dev/runbooks': {
'replacement': '{{ .Values.defaultRules.runbookUrl }}',
'init': ''},
- 'https://github.com/prometheus-operator/kube-prometheus/wiki/': {
- 'replacement': '{{ .Values.defaultRules.runbookUrl }}alert-name-',
- 'init': ''},
'job="kube-state-metrics"': {
'replacement': 'job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"',
'limitGroup': ['kubernetes-apps'],
diff --git a/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl b/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl
index 0a41901b6..1ab7d2607 100644
--- a/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl
+++ b/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl
@@ -82,57 +82,78 @@ app.kubernetes.io/name: {{ include "victoria-metrics-k8s-stack.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
-{{/*
-VM endpoint
-*/}}
-{{- define "victoria-metrics-k8s-stack.vmEndpoint" -}}
+
+{{/*
+Read (query) endpoint of the stack's storage backend.
+Renders the vmsingle service URL when vmsingle is enabled; otherwise, when vmcluster is
+enabled, the vmselect service URL with the /select/0/prometheus path. Renders nothing when
+both are disabled. vmsingle wins if both are enabled so that exactly one URL is emitted.
+*/}}
+{{- define "victoria-metrics-k8s-stack.vmSelectEndpoint" -}}
{{- if .Values.vmsingle.enabled -}}
-url: "http://{{ include "victoria-metrics-k8s-stack.vmsingleName" .}}.{{ .Release.Namespace }}.svc:{{ .Values.vmsingle.spec.port | default 8429 }}"
+{{ printf "http://%s.%s.svc:%v" (include "victoria-metrics-k8s-stack.vmsingleName" .) .Release.Namespace (.Values.vmsingle.spec.port | default 8429) }}
+{{- end }}
+{{- if and .Values.vmcluster.enabled (not .Values.vmsingle.enabled) -}}
+{{ printf "http://%s-%s.%s.svc:%v/select/0/prometheus" "vmselect" (include "victoria-metrics-k8s-stack.fullname" .) .Release.Namespace (.Values.vmcluster.spec.vmselect.port | default 8481) }}
{{- end }}
{{- end }}
-
-{{/*
-Alermanager spec
-*/}}
-{{- define "victoria-metrics-k8s-stack.alertmanagerSpec" -}}
-{{ omit .Values.alertmanager.spec "configMaps" "configSecret" | toYaml }}
-configSecret: {{ .Values.alertmanager.spec.configSecret | default (printf "%s-alertmanager" (include "victoria-metrics-k8s-stack.fullname" .)) }}
-{{- if or .Values.alertmanager.spec.configMaps .Values.alertmanager.monzoTemplate.enabled }}
-{{- $list := .Values.alertmanager.spec.configMaps | default (list "") }}
-{{- if .Values.alertmanager.monzoTemplate.enabled }}
-{{- $list = append $list (printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" $) "alertmanager-monzo-tpl" | trunc 63 | trimSuffix "-") }}
+{{/*
+Write (ingest) endpoint of the stack's storage backend.
+Renders the vmsingle service URL when vmsingle is enabled; otherwise, when vmcluster is
+enabled, the vminsert service URL with the /insert/0/prometheus path, so that a consumer
+appending /api/v1/write ends up on the Prometheus remote-write route. Renders nothing when
+both are disabled. vmsingle wins if both are enabled so that exactly one URL is emitted.
+*/}}
+{{- define "victoria-metrics-k8s-stack.vmInsertEndpoint" -}}
+{{- if .Values.vmsingle.enabled -}}
+{{ printf "http://%s.%s.svc:%v" (include "victoria-metrics-k8s-stack.vmsingleName" .) .Release.Namespace (.Values.vmsingle.spec.port | default 8429) }}
{{- end }}
-configMaps:
-{{- range compact $list }}
-- {{ . }}
+{{- if and .Values.vmcluster.enabled (not .Values.vmsingle.enabled) -}}
+{{ printf "http://%s-%s.%s.svc:%v/insert/0/prometheus" "vminsert" (include "victoria-metrics-k8s-stack.fullname" .) .Release.Namespace (.Values.vmcluster.spec.vminsert.port | default 8480) }}
{{- end }}
{{- end }}
+
+
+{{/*
+VMAlert remotes.
+Default connection settings for VMAlert: remoteWrite points at the insert endpoint,
+remoteRead and datasource at the select endpoint, and notifier at the stack's
+vmalertmanager service on port 9093.
+Each key is rendered as a single mapping with a `url` field (not a list): VMAlert takes one
+object per key, unlike VMAgent whose remoteWrite is a list.
+*/}}
+{{- define "victoria-metrics-k8s-stack.vmAlertRemotes" -}}
+remoteWrite:
+  url: {{ include "victoria-metrics-k8s-stack.vmInsertEndpoint" . }}
+remoteRead:
+  url: {{ include "victoria-metrics-k8s-stack.vmSelectEndpoint" . }}
+datasource:
+  url: {{ include "victoria-metrics-k8s-stack.vmSelectEndpoint" . }}
+notifier:
+  url: {{ printf "http://%s-%s.%s.svc:9093" "vmalertmanager" (include "victoria-metrics-k8s-stack.fullname" .) .Release.Namespace }}
{{- end }}
{{/*
VMAlert spec
*/}}
{{- define "victoria-metrics-k8s-stack.vmAlertSpec" -}}
-{{- $vmAlertStackRemoteWrite := dict "remoteWrite" ( include "victoria-metrics-k8s-stack.vmEndpoint" . | fromYaml ) -}}
-{{- $vmAlertStackRemoteRead := dict "remoteRead" ( include "victoria-metrics-k8s-stack.vmEndpoint" . | fromYaml ) -}}
-{{- $vmAlertStackDatasource := dict "datasource" ( include "victoria-metrics-k8s-stack.vmEndpoint" . | fromYaml ) -}}
-{{- $vmAlertStackNotifier := dict "notifier" ( dict "url" ( printf "http://vmalertmanager-%s.%s.svc:9093" (include "victoria-metrics-k8s-stack.fullname" .) .Release.Namespace ) ) -}}
-{{ deepCopy .Values.vmalert.spec | mergeOverwrite $vmAlertStackRemoteWrite $vmAlertStackRemoteRead $vmAlertStackDatasource $vmAlertStackNotifier | toYaml }}
+{{- /* The vmAlertRemotes output is the merge destination and a deep copy of .Values.vmalert.spec is merged over it, so user-supplied keys win; deepCopy keeps .Values itself unmodified. */ -}}
+{{ deepCopy .Values.vmalert.spec | mergeOverwrite (include "victoria-metrics-k8s-stack.vmAlertRemotes" . | fromYaml) | toYaml }}
{{- end }}
{{/*
-VM remoteWrite
+VM Agent remoteWrite: a one-element remoteWrite list whose url is the stack's insert endpoint (vmInsertEndpoint) with /api/v1/write appended
*/}}
{{- define "victoria-metrics-k8s-stack.vmAgentRemoteWrite" -}}
remoteWrite:
- - url: "http://{{ .Values.vmsingle.name | default (printf "vmsingle-%s" (include "victoria-metrics-k8s-stack.fullname" .))}}.{{ .Release.Namespace }}.svc:{{ .Values.vmsingle.spec.port | default 8429 }}/api/v1/write"
+ - url: {{ include "victoria-metrics-k8s-stack.vmInsertEndpoint" . }}/api/v1/write
{{- end }}
{{/*
VMAgent spec
*/}}
{{- define "victoria-metrics-k8s-stack.vmAgentSpec" -}}
-{{ deepCopy .Values.vmagent.spec | mergeOverwrite ( include "victoria-metrics-k8s-stack.vmAgentRemoteWrite" . | fromYaml ) | toYaml }}
+{{- /* The vmAgentRemoteWrite output is the merge destination and a deep copy of .Values.vmagent.spec is merged over it, so a user-supplied remoteWrite replaces the chart default. */ -}}
+{{ deepCopy .Values.vmagent.spec | mergeOverwrite ( include "victoria-metrics-k8s-stack.vmAgentRemoteWrite" . | fromYaml) | toYaml }}
+{{- end }}
+
+
+{{/*
+Alertmanager spec.
+Renders .Values.alertmanager.spec without its "configMaps" and "configSecret" keys, then
+re-adds configSecret (defaulting to "<fullname>-alertmanager") and, when configMaps are set
+or the monzo template is enabled, a configMaps list. With the monzo template enabled the
+"<fullname>-alertmanager-monzo-tpl" name (truncated to 63 characters, trailing "-" removed)
+is appended. Empty entries, including the "" placeholder used when no configMaps are set,
+are dropped by compact.
+*/}}
+{{- define "victoria-metrics-k8s-stack.alertmanagerSpec" -}}
+{{ omit .Values.alertmanager.spec "configMaps" "configSecret" | toYaml }}
+configSecret: {{ .Values.alertmanager.spec.configSecret | default (printf "%s-alertmanager" (include "victoria-metrics-k8s-stack.fullname" .)) }}
+{{- if or .Values.alertmanager.spec.configMaps .Values.alertmanager.monzoTemplate.enabled }}
+{{- $list := .Values.alertmanager.spec.configMaps | default (list "") }}
+{{- if .Values.alertmanager.monzoTemplate.enabled }}
+{{- $list = append $list (printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" $) "alertmanager-monzo-tpl" | trunc 63 | trimSuffix "-") }}
+{{- end }}
+configMaps:
+{{- range compact $list }}
+- {{ . }}
+{{- end }}
{{- end }}
+{{- end }}
\ No newline at end of file
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/alertmanager-overview.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/alertmanager-overview.yaml
new file mode 100644
index 000000000..3b765f032
--- /dev/null
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/alertmanager-overview.yaml
@@ -0,0 +1,607 @@
+{{- /*
+Generated from 'alertmanager-overview' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack
+*/ -}}
+{{- if and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ namespace: {{ .Release.Namespace }}
+ name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" $) "alertmanager-overview" | trunc 63 | trimSuffix "-" }}
+ labels:
+ {{- if $.Values.grafana.sidecar.dashboards.label }}
+ {{ $.Values.grafana.sidecar.dashboards.label }}: "1"
+ {{- end }}
+ app: {{ include "victoria-metrics-k8s-stack.name" $ }}-grafana
+{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }}
+data:
+ alertmanager-overview.json: |-
+ {
+ "__inputs": [
+
+ ],
+ "__requires": [
+
+ ],
+ "annotations": {
+ "list": [
+
+ ]
+ },
+ "editable": false,
+ "gnetId": null,
+ "graphTooltip": 1,
+ "hideControls": false,
+ "id": null,
+ "links": [
+
+ ],
+ "refresh": "30s",
+ "rows": [
+ {
+ "collapse": false,
+ "collapsed": false,
+ "panels": [
+ {
+ "aliasColors": {
+
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+
+ },
+ "id": 2,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
+
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+
+ ],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Alerts",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+
+ },
+ "id": 3,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
+
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+
+ ],
+ "spaceLength": 10,
+ "span": 6,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} Received",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} Invalid",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Alerts receive rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "Alerts",
+ "titleSize": "h6",
+ "type": "row"
+ },
+ {
+ "collapse": false,
+ "collapsed": false,
+ "panels": [
+ {
+ "aliasColors": {
+
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+
+ },
+ "id": 4,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
+
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": "integration",
+ "seriesOverrides": [
+
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} Total",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} Failed",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$integration: Notifications Send Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
+
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+
+ },
+ "id": 5,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
+
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": "integration",
+ "seriesOverrides": [
+
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} 99th Percentile",
+ "refId": "A"
+ },
+ {
+ "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} Median",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} Average",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "$integration: Notification Duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": true,
+ "title": "Notifications",
+ "titleSize": "h6",
+ "type": "row"
+ }
+ ],
+ "schemaVersion": 14,
+ "style": "dark",
+ "tags": [
+ "alertmanager-mixin"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "Prometheus",
+ "value": "Prometheus"
+ },
+ "hide": 0,
+ "label": null,
+ "name": "datasource",
+ "options": [
+
+ ],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "",
+ "value": ""
+ },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "namespace",
+ "options": [
+
+ ],
+ "query": "label_values(alertmanager_alerts, namespace)",
+ "refresh": 2,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [
+
+ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "",
+ "value": ""
+ },
+ "datasource": "$datasource",
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "service",
+ "options": [
+
+ ],
+ "query": "label_values(alertmanager_alerts, service)",
+ "refresh": 2,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [
+
+ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "all",
+ "value": "$__all"
+ },
+ "datasource": "$datasource",
+ "hide": 2,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "integration",
+ "options": [
+
+ ],
+ "query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)",
+ "refresh": 2,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [
+
+ ],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "utc",
+ "title": "Alertmanager / Overview",
+ "uid": "alertmanager-overview",
+ "version": 0
+ }
+{{- end }}
\ No newline at end of file
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/clusterbytenant.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/clusterbytenant.yaml
new file mode 100644
index 000000000..35860fd77
--- /dev/null
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/clusterbytenant.yaml
@@ -0,0 +1,695 @@
+{{- /*
+Generated from 'clusterbytenant' from https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/cluster/dashboards/clusterbytenant.json
+Do not change in-place! In order to change this file first read following link:
+https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack
+*/ -}}
+{{- if and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.vmcluster.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ namespace: {{ .Release.Namespace }}
+ name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" $) "clusterbytenant" | trunc 63 | trimSuffix "-" }}
+ labels:
+ {{- if $.Values.grafana.sidecar.dashboards.label }}
+ {{ $.Values.grafana.sidecar.dashboards.label }}: "1"
+ {{- end }}
+ app: {{ include "victoria-metrics-k8s-stack.name" $ }}-grafana
+{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }}
+data:
+ clusterbytenant.json: |-
+ {
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Overview for enterprise cluster VictoriaMetrics v1.56.0 or higher",
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": 13,
+ "iteration": 1617980754279,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "How many datapoints are inserted into storage per second by accountID and projectID",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 2,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.4",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(increase(vm_tenant_inserted_rows_total{job=~\"$job\", instance=~\"$instance\",accountID=~\"$accountID\", projectID=~\"$projectID\"}[1m])/60) by (accountID,projectID) ",
+ "interval": "",
+ "legendFormat": "inserted rows: {{`{{`}}accountID{{`}}`}}:{{`{{`}}projectID{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Datapoints ingestion rate ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "Request rate accepted by vmselect nodes per tenant",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 4,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.4",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(vm_tenant_select_requests_total{job=~\"$job\", instance=~\"$instance.*\",accountID=~\"$accountID\", projectID=~\"$projectID\"}[5m])) by (accountID,projectID) ",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "tenant: {{`{{`}}accountID{{`}}`}}:{{`{{`}}projectID{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Query rate ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. \n\nSee following link for details:",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 8
+ },
+ "hiddenSeries": false,
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
+ {
+ "targetBlank": true,
+ "title": "troubleshooting",
+ "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#troubleshooting"
+ }
+ ],
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.4",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(vm_tenant_active_timeseries{job=~\"$job\", instance=~\"$instance.*\",accountID=~\"$accountID\",projectID=~\"$projectID\"}) by(accountID,projectID)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Active time series tenant: {{`{{`}}accountID{{`}}`}}:{{`{{`}}projectID{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Active time series ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "Shows how many of new time-series are created every second. High churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 8
+ },
+ "hiddenSeries": false,
+ "id": 8,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.4",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(increase(vm_tenant_timeseries_created_total{job=~\"$job\", instance=~\"$instance\",accountID=~\"$accountID\", projectID=~\"$projectID\"}[1m])/60) by(accountID,projectID)",
+ "interval": "",
+ "legendFormat": "churn rate tenant: {{`{{`}}accountID{{`}}`}}:{{`{{`}}projectID{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Churn rate ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "Shows amount of on-disk space occupied by data points.",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 16
+ },
+ "hiddenSeries": false,
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.4",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(vm_tenant_used_tenant_bytes{job=~\"$job\", instance=~\"$instance\",accountID=~\"$accountID\",projectID=~\"$projectID\"}) by(accountID,projectID)",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{`{{`}}accountID{{`}}`}}:{{`{{`}}projectID{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk space usage (datapoints) ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "schemaVersion": 26,
+ "style": "dark",
+ "tags": [
+ "VictoriaMetrics",
+ "monitoring"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "gw",
+ "value": "gw"
+ },
+ "error": null,
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "ds",
+ "options": [],
+ "query": "prometheus",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": ".*",
+ "current": {
+ "selected": false,
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": "$ds",
+ "definition": "label_values(vm_app_version{version=~\"^vm(insert|select|storage).*\"}, job)",
+ "error": null,
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": true,
+ "name": "job",
+ "options": [],
+ "query": "label_values(vm_app_version{version=~\"^vm(insert|select|storage).*\"}, job)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".*",
+ "current": {
+ "selected": false,
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": "$ds",
+ "definition": "label_values(vm_app_version{job=~\"$job\"}, instance)",
+ "error": null,
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "instance",
+ "options": [],
+ "query": "label_values(vm_app_version{job=~\"$job\"}, instance)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".*",
+ "current": {
+ "selected": false,
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": "$ds",
+ "definition": "label_values(vm_tenant_active_timeseries{job=~\"$job\"},accountID)",
+ "error": null,
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "accountID",
+ "options": [],
+ "query": "label_values(vm_tenant_active_timeseries{job=~\"$job\"},accountID)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": ".*",
+ "current": {
+ "selected": false,
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": "$ds",
+ "definition": "label_values(vm_tenant_active_timeseries{accountID=~\"$accountID\"},projectID)",
+ "error": null,
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "projectID",
+ "options": [],
+ "query": "label_values(vm_tenant_active_timeseries{accountID=~\"$accountID\"},projectID)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "",
+ "title": "VictoriaMetrics cluster per tenant Copy",
+ "uid": "IZFqd3lMz",
+ "version": 1
+ }
+{{- end }}
\ No newline at end of file
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/controller-manager.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/controller-manager.yaml
index 572b330b7..b5b76d4e1 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/controller-manager.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/controller-manager.yaml
@@ -173,10 +173,10 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
+ "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
"refId": "A"
}
],
@@ -279,10 +279,10 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
+ "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
"refId": "A"
}
],
@@ -385,10 +385,10 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
"refId": "A"
}
],
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml
index a2e03eb63..de52d5e1e 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml
@@ -244,7 +244,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
+ "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@@ -496,7 +496,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
+ "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@@ -591,7 +591,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}namespace{{`}}`}}",
@@ -882,7 +882,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -900,7 +900,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -909,7 +909,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
+ "expr": "sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -918,7 +918,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1318,7 +1318,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
+ "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1327,7 +1327,7 @@ data:
"step": 10
},
{
- "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
+ "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1336,7 +1336,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
+ "expr": "sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1345,7 +1345,7 @@ data:
"step": 10
},
{
- "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)",
+ "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -2146,7 +2146,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -2232,7 +2232,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -2330,7 +2330,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -2416,7 +2416,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml
index 18f2753e7..70dd9b322 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml
@@ -75,7 +75,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@@ -159,7 +159,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@@ -443,7 +443,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
@@ -694,7 +694,7 @@ data:
],
"targets": [
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -703,7 +703,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
+ "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -712,7 +712,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -721,7 +721,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
+ "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -730,7 +730,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1168,7 +1168,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)",
+ "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1177,7 +1177,7 @@ data:
"step": 10
},
{
- "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)",
+ "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1186,7 +1186,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)",
+ "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1195,7 +1195,7 @@ data:
"step": 10
},
{
- "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)",
+ "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1839,7 +1839,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1925,7 +1925,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -2023,7 +2023,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -2109,7 +2109,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-node.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-node.yaml
index 75cab8d1b..758119a64 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-node.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-node.yaml
@@ -74,7 +74,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
@@ -309,7 +309,7 @@ data:
],
"targets": [
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -318,7 +318,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
+ "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -327,7 +327,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -336,7 +336,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
+ "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -345,7 +345,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -746,7 +746,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)",
+ "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -755,7 +755,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -764,7 +764,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)",
+ "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -773,7 +773,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)",
+ "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\"}) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-pod.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-pod.yaml
index 4bb66eb14..09b182896 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-pod.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-pod.yaml
@@ -91,7 +91,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}container{{`}}`}}",
@@ -447,7 +447,7 @@ data:
],
"targets": [
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -456,7 +456,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
+ "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -465,7 +465,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -474,7 +474,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
+ "expr": "sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -483,7 +483,7 @@ data:
"step": 10
},
{
- "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)",
+ "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -919,7 +919,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)",
+ "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -928,7 +928,7 @@ data:
"step": 10
},
{
- "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)",
+ "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -937,7 +937,7 @@ data:
"step": 10
},
{
- "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)",
+ "expr": "sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -946,7 +946,7 @@ data:
"step": 10
},
{
- "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)",
+ "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1291,7 +1291,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1378,7 +1378,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1477,7 +1477,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1564,7 +1564,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workload.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workload.yaml
index 2e96a4374..baee08097 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workload.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workload.yaml
@@ -74,7 +74,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
+ "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}pod{{`}}`}}",
@@ -309,7 +309,7 @@ data:
],
"targets": [
{
- "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
+ "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -327,7 +327,7 @@ data:
"step": 10
},
{
- "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
+ "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -345,7 +345,7 @@ data:
"step": 10
},
{
- "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
+ "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1517,7 +1517,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1603,7 +1603,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1701,7 +1701,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1787,7 +1787,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml
index 32f5f0688..f06866d84 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml
@@ -95,7 +95,7 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
+ "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{`{{`}}workload{{`}}`}} - {{`{{`}}workload_type{{`}}`}}",
@@ -393,7 +393,7 @@ data:
"step": 10
},
{
- "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
+ "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -411,7 +411,7 @@ data:
"step": 10
},
{
- "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
+ "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -429,7 +429,7 @@ data:
"step": 10
},
{
- "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
+ "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -1704,7 +1704,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1790,7 +1790,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1888,7 +1888,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
@@ -1974,7 +1974,7 @@ data:
},
"yaxes": [
{
- "format": "Bps",
+ "format": "pps",
"label": null,
"logBase": 1,
"max": null,
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/kubelet.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/kubelet.yaml
index 1d84c7b9e..925c0d074 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/kubelet.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/kubelet.yaml
@@ -37,2384 +37,2099 @@ data:
"links": [
],
- "refresh": "10s",
- "rows": [
+ "panels": [
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": "$datasource",
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
-
- },
- "id": 2,
- "interval": null,
- "links": [
-
- ],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 2,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Running Kubelets",
- "tooltip": {
- "shared": false
- },
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "min"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": "$datasource",
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
-
- },
- "id": 3,
- "interval": null,
- "links": [
-
- ],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 2,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Running Pods",
- "tooltip": {
- "shared": false
- },
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "min"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": "$datasource",
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
-
- },
- "id": 4,
- "interval": null,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
"links": [
],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 2,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Running Container",
- "tooltip": {
- "shared": false
- },
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "min"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": "$datasource",
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
-
- },
- "id": 5,
- "interval": null,
- "links": [
+ "mappings": [
],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 2,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Actual Volume Count",
- "tooltip": {
- "shared": false
- },
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "min"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": "$datasource",
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ ]
},
- "id": 6,
- "interval": null,
- "links": [
+ "unit": "none"
+ }
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "links": [
- ],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 2,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Desired Volume Count",
- "tooltip": {
- "shared": false
- },
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "min"
- },
+ ],
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7",
+ "targets": [
{
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": "$datasource",
- "format": "none",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
-
- },
- "id": 7,
- "interval": null,
- "links": [
-
- ],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "span": 2,
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "title": "Config Error Count",
- "tooltip": {
- "shared": false
- },
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "min"
+ "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A"
}
],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "title": "Running Kubelets",
+ "transparent": false,
+ "type": "stat"
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
-
- },
- "id": 8,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
"links": [
],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
-
- ],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ "mappings": [
],
- "timeFrom": null,
- "timeShift": null,
- "title": "Operation Rate",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
]
},
- "yaxes": [
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
+ "unit": "none"
+ }
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 4,
+ "y": 0
+ },
+ "id": 3,
+ "links": [
- },
- "id": 9,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
+ ],
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7",
+ "targets": [
+ {
+ "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Running Pods",
+ "transparent": false,
+ "type": "stat"
+ },
+ {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
"links": [
],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ "mappings": [
],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "Operation Error Rate",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
]
},
- "yaxes": [
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
+ "unit": "none"
}
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 8,
+ "y": 0
+ },
+ "id": 4,
+ "links": [
+
],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7",
+ "targets": [
+ {
+ "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Running Container",
+ "transparent": false,
+ "type": "stat"
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
-
- },
- "id": 10,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
"links": [
],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
-
- ],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ "mappings": [
],
- "timeFrom": null,
- "timeShift": null,
- "title": "Operation duration 99th quantile",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
]
},
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
+ "unit": "none"
}
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 12,
+ "y": 0
+ },
+ "id": 5,
+ "links": [
+
],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7",
+ "targets": [
+ {
+ "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Actual Volume Count",
+ "transparent": false,
+ "type": "stat"
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
-
- },
- "id": 11,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
"links": [
],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
-
- ],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} pod",
- "refId": "A"
- },
- {
- "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} worker",
- "refId": "B"
- }
- ],
- "thresholds": [
+ "mappings": [
],
- "timeFrom": null,
- "timeShift": null,
- "title": "Pod Start Rate",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
]
},
- "yaxes": [
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
- },
- {
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
+ "unit": "none"
+ }
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 16,
+ "y": 0
+ },
+ "id": 6,
+ "links": [
- },
- "id": 12,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
+ ],
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7",
+ "targets": [
+ {
+ "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Desired Volume Count",
+ "transparent": false,
+ "type": "stat"
+ },
+ {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
"links": [
],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
-
- ],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} pod",
- "refId": "A"
- },
- {
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} worker",
- "refId": "B"
- }
- ],
- "thresholds": [
+ "mappings": [
],
- "timeFrom": null,
- "timeShift": null,
- "title": "Pod Start Duration",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
]
},
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
+ "unit": "none"
}
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 20,
+ "y": 0
+ },
+ "id": 7,
+ "links": [
+
],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7",
+ "targets": [
+ {
+ "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Config Error Count",
+ "transparent": false,
+ "type": "stat"
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
-
- },
- "id": 13,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "hideEmpty": true,
- "hideZero": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ "aliasColors": {
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 7
+ },
+ "id": 8,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "Storage Operation Rate",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
- ]
- },
- "yaxes": [
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Operation Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
},
{
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
-
- },
- "id": 14,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "hideEmpty": true,
- "hideZero": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- ],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 7
+ },
+ "id": 9,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "Storage Operation Error Rate",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ]
- },
- "yaxes": [
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
+ "refId": "A"
}
],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Operation Error Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
+ "aliasColors": {
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 14
+ },
+ "id": 10,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- },
- "id": 15,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "hideEmpty": true,
- "hideZero": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
-
- ],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "Storage Operation Duration 99th quantile",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ]
- },
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
+ "refId": "A"
}
],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Operation duration 99th quantile",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
+ "aliasColors": {
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
-
- },
- "id": 16,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
-
- ],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}operation_type{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 21
+ },
+ "id": 11,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "Cgroup manager operation rate",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ]
- },
- "yaxes": [
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} pod",
+ "refId": "A"
},
{
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
-
- },
- "id": 17,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} worker",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Pod Start Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- ],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 21
+ },
+ "id": 12,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "Cgroup manager 99th quantile",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ]
- },
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} pod",
+ "refId": "A"
+ },
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} worker",
+ "refId": "B"
}
],
- "repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Pod Start Duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "description": "Pod lifecycle event generator",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
-
- },
- "id": 18,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ "aliasColors": {
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "id": 13,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "PLEG relist rate",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
- ]
- },
- "yaxes": [
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Storage Operation Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
},
{
- "aliasColors": {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 28
+ },
+ "id": 14,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- },
- "id": 19,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
- ],
- "spaceLength": 10,
- "span": 6,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Storage Operation Error Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "PLEG relist interval",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 35
+ },
+ "id": 15,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ]
- },
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
- }
],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
"repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "seriesOverrides": [
+
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Storage Operation Duration 99th quantile",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
+ "aliasColors": {
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 42
+ },
+ "id": 16,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- },
- "id": 20,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}operation_type{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
- ],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cgroup manager operation rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "PLEG relist duration",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 42
+ },
+ "id": 17,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ]
- },
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
- }
],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
"repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "seriesOverrides": [
+
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cgroup manager 99th quantile",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
+ "aliasColors": {
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "Pod lifecycle event generator",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 49
+ },
+ "id": 18,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- },
- "id": 21,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
- ],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "2xx",
- "refId": "A"
- },
- {
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "3xx",
- "refId": "B"
- },
- {
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "4xx",
- "refId": "C"
- },
- {
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "5xx",
- "refId": "D"
- }
- ],
- "thresholds": [
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "PLEG relist rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "RPC Rate",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 49
+ },
+ "id": 19,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ]
- },
- "yaxes": [
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "ops",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
- }
],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
"repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "seriesOverrides": [
+
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "PLEG relist interval",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
},
{
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
+ "aliasColors": {
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 56
+ },
+ "id": 20,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- },
- "id": 22,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": true,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
- ],
- "spaceLength": 10,
- "span": 12,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "PLEG relist duration",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "Request duration 99th quantile",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 63
+ },
+ "id": 21,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ]
- },
- "yaxes": [
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
- }
],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
"repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
- },
- {
- "collapse": false,
- "collapsed": false,
- "panels": [
- {
- "aliasColors": {
+ "seriesOverrides": [
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "2xx",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "3xx",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "4xx",
+ "refId": "C"
+ },
+ {
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "5xx",
+ "refId": "D"
+ }
+ ],
+ "thresholds": [
- },
- "id": 23,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "RPC Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "ops",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 70
+ },
+ "id": 22,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ],
- "spaceLength": 10,
- "span": 4,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "Memory",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
- ]
- },
- "yaxes": [
- {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Request duration 99th quantile",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
},
{
- "aliasColors": {
-
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
-
- },
- "id": 24,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 77
+ },
+ "id": 23,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ],
- "spaceLength": 10,
- "span": 4,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "CPU usage",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
- ]
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
- }
- ]
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Memory",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
},
{
- "aliasColors": {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 8,
+ "y": 77
+ },
+ "id": 24,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- },
- "id": 25,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
+ ],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
- ],
- "spaceLength": 10,
- "span": 4,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
- "format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU usage",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {
- ],
- "timeFrom": null,
- "timeShift": null,
- "title": "Goroutines",
- "tooltip": {
- "shared": false,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 16,
+ "y": 77
+ },
+ "id": 25,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
- ]
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- }
- ]
- }
],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
"repeat": null,
- "repeatIteration": null,
- "repeatRowId": null,
- "showTitle": false,
- "title": "Dashboard Row",
- "titleSize": "h6",
- "type": "row"
+ "seriesOverrides": [
+
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [
+
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Goroutines",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": [
+
+ ]
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
}
+ ],
+ "refresh": "10s",
+ "rows": [
+
],
"schemaVersion": 14,
"style": "dark",
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/scheduler.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/scheduler.yaml
index 29cab75a3..b11b4da31 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/scheduler.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/scheduler.yaml
@@ -173,31 +173,31 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
+ "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} e2e",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e",
"refId": "A"
},
{
- "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
+ "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} binding",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding",
"refId": "B"
},
{
- "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
+ "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm",
"refId": "C"
},
{
- "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
+ "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} volume",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume",
"refId": "D"
}
],
@@ -287,31 +287,31 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} e2e",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e",
"refId": "A"
},
{
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} binding",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding",
"refId": "B"
},
{
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm",
"refId": "C"
},
{
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
"format": "time_series",
"intervalFactor": 2,
- "legendFormat": "{{`{{`}}instance{{`}}`}} volume",
+ "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume",
"refId": "D"
}
],
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/victoriametrics.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/victoriametrics.yaml
index 95dddd8b0..342b0cac3 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/victoriametrics.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/victoriametrics.yaml
@@ -24,12 +24,12 @@ data:
"type": "grafana",
"id": "grafana",
"name": "Grafana",
- "version": "7.1.1"
+ "version": "8.0.0"
},
{
"type": "panel",
"id": "graph",
- "name": "Graph",
+ "name": "Graph (old)",
"version": ""
},
{
@@ -40,15 +40,15 @@ data:
},
{
"type": "panel",
- "id": "singlestat",
- "name": "Singlestat",
+ "id": "stat",
+ "name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "text",
"name": "Text",
- "version": "7.1.0"
+ "version": ""
}
],
"annotations": {
@@ -64,12 +64,12 @@ data:
}
]
},
- "description": "Overview for single node VictoriaMetrics v1.56.0 or higher",
+ "description": "Overview for single node VictoriaMetrics v1.57.0 or higher",
"editable": true,
"gnetId": 10229,
"graphTooltip": 0,
"id": null,
- "iteration": 1616956884194,
+ "iteration": 1624970666582,
"links": [
{
"icon": "doc",
@@ -99,7 +99,7 @@ data:
],
"panels": [
{
- "collapsed": false,
+ "collapsed": true,
"datasource": "$ds",
"gridPos": {
"h": 1,
@@ -108,706 +108,956 @@ data:
"y": 0
},
"id": 6,
- "panels": [],
- "title": "Configuration",
+ "panels": [
+ {
+ "datasource": null,
+ "description": "",
+ "gridPos": {
+ "h": 2,
+ "w": 4,
+ "x": 0,
+ "y": 1
+ },
+ "id": 85,
+ "options": {
+ "content": "<div style=\"text-align: center;\">$version</div>",
+ "mode": "markdown"
+ },
+ "pluginVersion": "8.0.0",
+ "title": "Version",
+ "type": "text"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "$ds",
+ "description": "How many datapoints are in storage",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 4,
+ "y": 1
+ },
+ "id": 26,
+ "interval": null,
+ "links": [],
+ "maxDataPoints": 100,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "8.0.0",
+ "targets": [
+ {
+ "exemplar": true,
+ "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total datapoints",
+ "type": "stat"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "$ds",
+ "description": "Total amount of used disk space",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 9,
+ "y": 1
+ },
+ "id": 81,
+ "interval": null,
+ "links": [],
+ "maxDataPoints": 100,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "8.0.0",
+ "targets": [
+ {
+ "exemplar": true,
+ "expr": "sum(vm_data_size_bytes{job=\"$job\", type!=\"indexdb\"})",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Disk space usage",
+ "type": "stat"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "$ds",
+ "description": "Average disk usage per datapoint.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 14,
+ "y": 1
+ },
+ "id": 82,
+ "interval": null,
+ "links": [],
+ "maxDataPoints": 100,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "8.0.0",
+ "targets": [
+ {
+ "exemplar": true,
+ "expr": "sum(vm_data_size_bytes{job=\"$job\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", type!=\"indexdb\"})",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Bytes per point",
+ "type": "stat"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "$ds",
+ "description": "Total size of allowed memory via flag `-memory.allowedPercent`",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 19,
+ "y": 1
+ },
+ "id": 79,
+ "interval": null,
+ "links": [],
+ "maxDataPoints": 100,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "8.0.0",
+ "targets": [
+ {
+ "exemplar": true,
+ "expr": "sum(vm_allowed_memory_bytes{job=\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Allowed memory",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "red",
+ "value": null
+ },
+ {
+ "color": "green",
+ "value": 1800
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 4,
+ "x": 0,
+ "y": 3
+ },
+ "id": 87,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "8.0.0",
+ "targets": [
+ {
+ "exemplar": true,
+ "expr": "vm_app_uptime_seconds{job=\"$job\", instance=\"$instance\"}",
+ "instant": true,
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Uptime",
+ "type": "stat"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "$ds",
+ "description": "How many entries the inverted index contains. This value is proportional to the number of unique time series in storage (cardinality).",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 4,
+ "y": 3
+ },
+ "id": 38,
+ "interval": null,
+ "links": [],
+ "maxDataPoints": 100,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "8.0.0",
+ "targets": [
+ {
+ "exemplar": true,
+ "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"})",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Index size",
+ "type": "stat"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "$ds",
+ "description": "The minimum free disk space left",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "percentage",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 9,
+ "y": 3
+ },
+ "id": 80,
+ "interval": null,
+ "links": [],
+ "maxDataPoints": 100,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "8.0.0",
+ "targets": [
+ {
+ "exemplar": true,
+ "expr": "min(vm_free_disk_space_bytes{job=\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Min free disk space",
+ "type": "stat"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "$ds",
+ "description": "Total number of available CPUs for VM process",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 14,
+ "y": 3
+ },
+ "id": 77,
+ "interval": null,
+ "links": [],
+ "maxDataPoints": 100,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "8.0.0",
+ "targets": [
+ {
+ "exemplar": true,
+ "expr": "sum(vm_available_cpu_cores{job=\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Available CPU",
+ "type": "stat"
+ },
+ {
+ "cacheTimeout": null,
+ "datasource": "$ds",
+ "description": "Total size of available memory for VM process",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 5,
+ "x": 19,
+ "y": 3
+ },
+ "id": 78,
+ "interval": null,
+ "links": [],
+ "maxDataPoints": 100,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "8.0.0",
+ "targets": [
+ {
+ "exemplar": true,
+ "expr": "sum(vm_available_memory_bytes{job=\"$job\", instance=~\"$instance\"})",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Available memory",
+ "type": "stat"
+ }
+ ],
+ "title": "Stats",
"type": "row"
},
{
- "content": "<div style=\"text-align: center;\">$version</div>",
+ "collapsed": false,
"datasource": "$ds",
- "description": "",
- "fieldConfig": {
- "defaults": {
- "custom": {}
- },
- "overrides": []
- },
"gridPos": {
- "h": 2,
- "w": 6,
+ "h": 1,
+ "w": 24,
"x": 0,
"y": 1
},
- "id": 2,
- "links": [
- {
- "targetBlank": true,
- "title": "VictoriaMetrics releases",
- "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/releases"
- }
- ],
- "mode": "html",
- "options": {
- "content": "<div style=\"text-align: center;\">$version</div>",
- "mode": "html"
- },
- "pluginVersion": "7.1.0",
- "timeFrom": null,
- "timeShift": null,
- "title": "Version",
- "type": "text"
+ "id": 24,
+ "panels": [],
+ "title": "Performance",
+ "type": "row"
},
{
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
"datasource": "$ds",
- "description": "How many datapoints are in storage",
+ "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
"fieldConfig": {
"defaults": {
- "custom": {}
+ "links": []
},
"overrides": []
},
- "format": "short",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
+ "fill": 1,
+ "fillGradient": 0,
"gridPos": {
- "h": 2,
- "w": 6,
- "x": 6,
- "y": 1
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 2
+ },
+ "hiddenSeries": false,
+ "id": 12,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
},
- "id": 26,
- "interval": null,
+ "lines": true,
+ "linewidth": 1,
"links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": true,
- "lineColor": "rgb(31, 120, 193)",
- "show": true
+ "nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
},
- "tableColumn": "",
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
"targets": [
{
- "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})",
+ "expr": "sum(rate(vm_http_requests_total{job=\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by (path) > 0",
"format": "time_series",
- "instant": false,
+ "interval": "",
"intervalFactor": 1,
+ "legendFormat": "{{`{{`}}path{{`}}`}}",
"refId": "A"
}
],
- "thresholds": "",
+ "thresholds": [],
"timeFrom": null,
+ "timeRegions": [],
"timeShift": null,
- "title": "Total datapoints",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
+ "title": "Requests rate ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
}
],
- "valueName": "current"
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
},
{
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
"datasource": "$ds",
- "description": "The size of the free disk space left",
+ "description": "The less time it takes, the better.\n* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
"fieldConfig": {
"defaults": {
- "custom": {}
+ "links": []
},
"overrides": []
},
- "format": "bytes",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
+ "fill": 1,
+ "fillGradient": 0,
"gridPos": {
- "h": 2,
- "w": 6,
+ "h": 8,
+ "w": 12,
"x": 12,
- "y": 1
+ "y": 2
+ },
+ "hiddenSeries": false,
+ "id": 22,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
},
- "id": 80,
- "interval": null,
+ "lines": true,
+ "linewidth": 1,
"links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": true,
- "lineColor": "rgb(31, 120, 193)",
- "show": true
+ "nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
},
- "tableColumn": "",
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
"targets": [
{
- "expr": "sum(vm_free_disk_space_bytes{job=\"$job\", instance=~\"$instance\", path=\"/storage\"})",
+ "expr": "max(vm_request_duration_seconds{job=\"$job\", instance=~\"$instance\", quantile=~\"(0.5|0.99)\"}) by (path, quantile) > 0",
"format": "time_series",
- "instant": false,
- "interval": "",
"intervalFactor": 1,
- "legendFormat": "",
+ "legendFormat": "{{`{{`}}quantile{{`}}`}} ({{`{{`}}path{{`}}`}})",
"refId": "A"
}
],
- "thresholds": "",
+ "thresholds": [],
"timeFrom": null,
+ "timeRegions": [],
"timeShift": null,
- "title": "Free disk space",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
+ "title": "Query duration ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
}
],
- "valueName": "current"
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
},
{
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
"datasource": "$ds",
- "description": "Total size of available memory for VM process",
+ "description": "Shows the number of active time series with new data points inserted during the last hour. A high value may result in ingestion slowdown. \n\nSee the following link for details:",
"fieldConfig": {
"defaults": {
- "custom": {}
+ "links": []
},
"overrides": []
},
- "format": "bytes",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
+ "fill": 1,
+ "fillGradient": 0,
"gridPos": {
- "h": 2,
- "w": 6,
- "x": 18,
- "y": 1
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 10
},
- "id": 78,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
+ "hiddenSeries": false,
+ "id": 51,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [
{
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
+ "targetBlank": true,
+ "title": "troubleshooting",
+ "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#troubleshooting"
}
],
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": true,
- "lineColor": "rgb(31, 120, 193)",
- "show": true
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
},
- "tableColumn": "",
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
"targets": [
{
- "expr": "sum(vm_available_memory_bytes{job=\"$job\", instance=~\"$instance\"})",
+ "expr": "vm_cache_entries{job=\"$job\", instance=~\"$instance\", type=\"storage/hour_metric_ids\"}",
"format": "time_series",
- "instant": false,
- "interval": "",
"intervalFactor": 1,
- "legendFormat": "",
+ "legendFormat": "Active time series",
"refId": "A"
}
],
- "thresholds": "",
+ "thresholds": [],
"timeFrom": null,
+ "timeRegions": [],
"timeShift": null,
- "title": "Available memory",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
+ "title": "Active time series ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
}
],
- "valueName": "current"
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
},
{
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
"datasource": "$ds",
+ "description": "VictoriaMetrics stores various caches in RAM. The memory size for these caches may be limited with the `-memory.allowedPercent` flag. The `max allowed` line shows the maximum allowed memory size for the cache.",
"fieldConfig": {
"defaults": {
- "custom": {}
- },
- "overrides": []
- },
- "format": "s",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
- "h": 2,
- "w": 6,
- "x": 0,
- "y": 3
- },
- "id": 8,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": false,
- "lineColor": "rgb(31, 120, 193)",
- "show": false
- },
- "tableColumn": "vm_app_uptime_seconds{instance=\"victoriametrics:8428\", job=\"victoriametrics\"}",
- "targets": [
- {
- "expr": "vm_app_uptime_seconds{job=\"$job\", instance=\"$instance\"}",
- "format": "time_series",
- "intervalFactor": 1,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "timeFrom": null,
- "timeShift": null,
- "title": "Uptime",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "current"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": "$ds",
- "description": "How many entries inverted index contains. This value is proportional to the number of unique timeseries in storage(cardinality).",
- "fieldConfig": {
- "defaults": {
- "custom": {}
- },
- "overrides": []
- },
- "format": "short",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
- "h": 2,
- "w": 6,
- "x": 6,
- "y": 3
- },
- "id": 38,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": true,
- "lineColor": "rgb(31, 120, 193)",
- "show": true
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"})",
- "format": "time_series",
- "instant": false,
- "intervalFactor": 1,
- "refId": "A"
- }
- ],
- "thresholds": "",
- "timeFrom": null,
- "timeShift": null,
- "title": "Index size",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "current"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": "$ds",
- "description": "Total number of available CPUs for VM process",
- "fieldConfig": {
- "defaults": {
- "custom": {}
- },
- "overrides": []
- },
- "format": "short",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
- "h": 2,
- "w": 6,
- "x": 12,
- "y": 3
- },
- "id": 77,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": true,
- "lineColor": "rgb(31, 120, 193)",
- "show": true
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(vm_available_cpu_cores{job=\"$job\", instance=~\"$instance\"})",
- "format": "time_series",
- "instant": false,
- "interval": "",
- "intervalFactor": 1,
- "legendFormat": "",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "timeFrom": null,
- "timeShift": null,
- "title": "Available CPU",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "current"
- },
- {
- "cacheTimeout": null,
- "colorBackground": false,
- "colorValue": false,
- "colors": [
- "#299c46",
- "rgba(237, 129, 40, 0.89)",
- "#d44a3a"
- ],
- "datasource": "$ds",
- "description": "Total size of allowed memory via flag `-memory.allowedPercent`",
- "fieldConfig": {
- "defaults": {
- "custom": {}
- },
- "overrides": []
- },
- "format": "bytes",
- "gauge": {
- "maxValue": 100,
- "minValue": 0,
- "show": false,
- "thresholdLabels": false,
- "thresholdMarkers": true
- },
- "gridPos": {
- "h": 2,
- "w": 6,
- "x": 18,
- "y": 3
- },
- "id": 79,
- "interval": null,
- "links": [],
- "mappingType": 1,
- "mappingTypes": [
- {
- "name": "value to text",
- "value": 1
- },
- {
- "name": "range to text",
- "value": 2
- }
- ],
- "maxDataPoints": 100,
- "nullPointMode": "connected",
- "nullText": null,
- "postfix": "",
- "postfixFontSize": "50%",
- "prefix": "",
- "prefixFontSize": "50%",
- "rangeMaps": [
- {
- "from": "null",
- "text": "N/A",
- "to": "null"
- }
- ],
- "sparkline": {
- "fillColor": "rgba(31, 118, 189, 0.18)",
- "full": true,
- "lineColor": "rgb(31, 120, 193)",
- "show": true
- },
- "tableColumn": "",
- "targets": [
- {
- "expr": "sum(vm_allowed_memory_bytes{job=\"$job\", instance=~\"$instance\"})",
- "format": "time_series",
- "instant": false,
- "interval": "",
- "intervalFactor": 1,
- "legendFormat": "",
- "refId": "A"
- }
- ],
- "thresholds": "",
- "timeFrom": null,
- "timeShift": null,
- "title": "Allowed memory",
- "type": "singlestat",
- "valueFontSize": "80%",
- "valueMaps": [
- {
- "op": "=",
- "text": "N/A",
- "value": "null"
- }
- ],
- "valueName": "current"
- },
- {
- "collapsed": false,
- "datasource": "$ds",
- "gridPos": {
- "h": 1,
- "w": 24,
- "x": 0,
- "y": 5
- },
- "id": 24,
- "panels": [],
- "title": "Performance",
- "type": "row"
- },
- {
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
+ "links": []
},
"overrides": []
},
@@ -816,16 +1066,16 @@ data:
"gridPos": {
"h": 8,
"w": 12,
- "x": 0,
- "y": 6
+ "x": 12,
+ "y": 10
},
"hiddenSeries": false,
- "id": 12,
+ "id": 33,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
- "max": false,
+ "max": true,
"min": false,
"show": true,
"sort": "current",
@@ -836,34 +1086,50 @@ data:
"lines": true,
"linewidth": 1,
"links": [],
- "nullPointMode": "null as zero",
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
- "seriesOverrides": [],
+ "seriesOverrides": [
+ {
+ "alias": "max allowed",
+ "color": "#C4162A"
+ }
+ ],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(vm_http_requests_total{job=\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by (path) > 0",
+ "expr": "sum(vm_cache_size_bytes{job=\"$job\", instance=\"$instance\"})",
"format": "time_series",
- "interval": "",
+ "hide": false,
"intervalFactor": 1,
- "legendFormat": "{{`{{`}}path{{`}}`}}",
+ "legendFormat": "size",
"refId": "A"
+ },
+ {
+ "expr": "max(vm_allowed_memory_bytes{job=\"$job\", instance=\"$instance\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "max allowed",
+ "refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
- "title": "Requests rate ($instance)",
+ "title": "Cache size ($instance)",
"tooltip": {
"shared": true,
- "sort": 2,
+ "sort": 0,
"value_type": "individual"
},
"type": "graph",
@@ -876,7 +1142,7 @@ data:
},
"yaxes": [
{
- "format": "short",
+ "format": "bytes",
"label": null,
"logBase": 1,
"max": null,
@@ -903,10 +1169,9 @@ data:
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
- "description": "The less time it takes is better.\n* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
+ "description": "Shows how many ongoing insertions (not API /write calls) on disk are taking place, where:\n* `max` - equal to number of CPUs;\n* `current` - current number of goroutines busy with inserting rows into underlying storage.\n\nEvery successful API /write call results in a flush to disk. However, these two actions are separated and controlled via different concurrency limiters. The `max` on this panel can't be changed and is always equal to the number of CPUs. \n\nWhen `current` hits `max` constantly, it means storage is overloaded and requires more CPU.\n\n",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -916,15 +1181,17 @@ data:
"gridPos": {
"h": 8,
"w": 12,
- "x": 12,
- "y": 6
+ "x": 0,
+ "y": 18
},
"hiddenSeries": false,
- "id": 22,
+ "id": 59,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
+ "hideEmpty": false,
+ "hideZero": false,
"max": false,
"min": false,
"show": true,
@@ -936,30 +1203,46 @@ data:
"lines": true,
"linewidth": 1,
"links": [],
- "nullPointMode": "null as zero",
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
- "seriesOverrides": [],
+ "seriesOverrides": [
+ {
+ "alias": "max",
+ "color": "#C4162A"
+ }
+ ],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
- "expr": "max(vm_request_duration_seconds{job=\"$job\", instance=~\"$instance\", quantile=~\"(0.5|0.99)\"}) by (path, quantile) > 0",
+ "expr": "sum(vm_concurrent_addrows_capacity{job=\"$job\", instance=\"$instance\"})",
"format": "time_series",
+ "interval": "",
"intervalFactor": 1,
- "legendFormat": "{{`{{`}}quantile{{`}}`}} ({{`{{`}}path{{`}}`}})",
+ "legendFormat": "max",
"refId": "A"
+ },
+ {
+ "expr": "sum(vm_concurrent_addrows_current{job=\"$job\", instance=\"$instance\"})",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "current",
+ "refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
- "title": "Query duration ($instance)",
+ "title": "Concurrent flushes on disk ($instance)",
"tooltip": {
"shared": true,
"sort": 2,
@@ -975,7 +1258,8 @@ data:
},
"yaxes": [
{
- "format": "s",
+ "decimals": 0,
+ "format": "short",
"label": null,
"logBase": 1,
"max": null,
@@ -983,6 +1267,7 @@ data:
"show": true
},
{
+ "decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
@@ -1002,10 +1287,9 @@ data:
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
- "description": "Shows the number of active time series with new data points inserted during the last hour. High value may result in ingestion slowdown. \n\nSee following link for details:",
+ "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1015,32 +1299,32 @@ data:
"gridPos": {
"h": 8,
"w": 12,
- "x": 0,
- "y": 14
+ "x": 12,
+ "y": 18
},
"hiddenSeries": false,
- "id": 51,
+ "id": 35,
"legend": {
- "avg": false,
- "current": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
"max": false,
"min": false,
"show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
- "links": [
- {
- "targetBlank": true,
- "title": "troubleshooting",
- "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/README.md#troubleshooting"
- }
- ],
- "nullPointMode": "null",
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1050,10 +1334,12 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "vm_cache_entries{job=\"$job\", instance=~\"$instance\", type=\"storage/hour_metric_ids\"}",
+ "exemplar": true,
+ "expr": "sum(rate(vm_http_request_errors_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (path) > 0",
"format": "time_series",
+ "interval": "",
"intervalFactor": 1,
- "legendFormat": "Active time series",
+ "legendFormat": "{{`{{`}}path{{`}}`}}",
"refId": "A"
}
],
@@ -1061,10 +1347,10 @@ data:
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
- "title": "Active time series ($instance)",
+ "title": "Requests error rate ($instance)",
"tooltip": {
"shared": true,
- "sort": 0,
+ "sort": 2,
"value_type": "individual"
},
"type": "graph",
@@ -1098,16 +1384,29 @@ data:
"alignLevel": null
}
},
+ {
+ "collapsed": false,
+ "datasource": "$ds",
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 26
+ },
+ "id": 14,
+ "panels": [],
+ "title": "Storage",
+ "type": "row"
+ },
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
- "description": "VictoriaMetrics stores various caches in RAM. Memory size for these caches may be limited with -`memory.allowedPercent` flag. Line `max allowed` shows max allowed memory size for cache.",
+ "description": "How many datapoints are inserted into storage per second",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1117,64 +1416,163 @@ data:
"gridPos": {
"h": 8,
"w": 12,
- "x": 12,
- "y": 14
+ "x": 0,
+ "y": 27
},
"hiddenSeries": false,
- "id": 33,
+ "id": 10,
"legend": {
- "avg": false,
- "current": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
"max": false,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
- "nullPointMode": "null",
+ "nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
- "seriesOverrides": [
- {
- "alias": "max allowed",
- "color": "#C4162A"
- }
- ],
+ "seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
- "expr": "sum(vm_cache_size_bytes{job=\"$job\", instance=\"$instance\"})",
+ "expr": "sum(rate(vm_rows_inserted_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (type) > 0",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
- "legendFormat": "size",
+ "legendFormat": "{{`{{`}}type{{`}}`}}",
"refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Datapoints ingestion rate ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
},
{
- "expr": "max(vm_allowed_memory_bytes{job=\"$job\", instance=\"$instance\"})",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "Shows the time needed to reach 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining before running out of disk space.\n\n",
+ "fieldConfig": {
+ "defaults": {
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 27
+ },
+ "hiddenSeries": false,
+ "id": 73,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideZero": true,
+ "max": false,
+ "min": true,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"} / ignoring(path) ((rate(vm_rows_added_to_storage_total{job=\"$job\", instance=\"$instance\"}[1d]) - ignoring(type) rate(vm_deduplicated_samples_total{job=\"$job\", instance=\"$instance\", type=\"merge\"}[1d])) * scalar(sum(vm_data_size_bytes{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"})))",
"format": "time_series",
"hide": false,
+ "interval": "",
"intervalFactor": 1,
- "legendFormat": "max allowed",
- "refId": "B"
+ "legendFormat": "",
+ "refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
- "title": "Cache size ($instance)",
+ "title": "Storage full ETA ($instance)",
"tooltip": {
"shared": true,
- "sort": 0,
+ "sort": 2,
"value_type": "individual"
},
"type": "graph",
@@ -1187,7 +1585,8 @@ data:
},
"yaxes": [
{
- "format": "bytes",
+ "decimals": null,
+ "format": "s",
"label": null,
"logBase": 1,
"max": null,
@@ -1214,10 +1613,9 @@ data:
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
- "description": "Shows how many ongoing insertions (not API /write calls) on disk are taking place, where:\n* `max` - equal to number of CPUs;\n* `current` - current number of goroutines busy with inserting rows into underlying storage.\n\nEvery successful API /write call results into flush on disk. However, these two actions are separated and controlled via different concurrency limiters. The `max` on this panel can't be changed and always equal to number of CPUs. \n\nWhen `current` hits `max` constantly, it means storage is overloaded and requires more CPU.\n\n",
+ "description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1228,17 +1626,15 @@ data:
"h": 8,
"w": 12,
"x": 0,
- "y": 22
+ "y": 35
},
"hiddenSeries": false,
- "id": 59,
+ "id": 30,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
- "hideEmpty": false,
- "hideZero": false,
- "max": false,
+ "max": true,
"min": false,
"show": true,
"sort": "current",
@@ -1250,15 +1646,18 @@ data:
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
- "alias": "max",
- "color": "#C4162A"
+ "alias": "bytes-per-datapoint",
+ "yaxis": 2
}
],
"spaceLength": 10,
@@ -1266,18 +1665,19 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(vm_concurrent_addrows_capacity{job=\"$job\", instance=\"$instance\"})",
+ "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
- "legendFormat": "max",
+ "legendFormat": "total datapoints",
"refId": "A"
},
{
- "expr": "sum(vm_concurrent_addrows_current{job=\"$job\", instance=\"$instance\"})",
+ "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})",
"format": "time_series",
+ "interval": "",
"intervalFactor": 1,
- "legendFormat": "current",
+ "legendFormat": "bytes-per-datapoint",
"refId": "B"
}
],
@@ -1285,7 +1685,7 @@ data:
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
- "title": "Concurrent flushes on disk ($instance)",
+ "title": "Datapoints ($instance)",
"tooltip": {
"shared": true,
"sort": 2,
@@ -1301,7 +1701,6 @@ data:
},
"yaxes": [
{
- "decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
@@ -1310,8 +1709,8 @@ data:
"show": true
},
{
- "decimals": 0,
- "format": "short",
+ "decimals": 2,
+ "format": "bytes",
"label": null,
"logBase": 1,
"max": null,
@@ -1330,10 +1729,9 @@ data:
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
- "description": "* `*` - unsupported query path\n* `/write` - insert into VM\n* `/metrics` - query VM system metrics\n* `/query` - query instant values\n* `/query_range` - query over a range of time\n* `/series` - match a certain label set\n* `/label/{}/values` - query a list of label values (variables mostly)",
+ "description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*<ingestion_rate>`, since VictoriaMetrics pushes pending data to persistent storage every second.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1344,15 +1742,15 @@ data:
"h": 8,
"w": 12,
"x": 12,
- "y": 22
+ "y": 35
},
"hiddenSeries": false,
- "id": 35,
+ "id": 34,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
- "max": false,
+ "max": true,
"min": false,
"show": true,
"sort": "current",
@@ -1363,33 +1761,50 @@ data:
"lines": true,
"linewidth": 1,
"links": [],
- "nullPointMode": "null as zero",
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
- "seriesOverrides": [],
+ "seriesOverrides": [
+ {
+ "alias": "pending index entries",
+ "yaxis": 2
+ }
+ ],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(vm_http_request_errors_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (path) > 0",
+ "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"storage\"}",
"format": "time_series",
+ "hide": false,
"intervalFactor": 1,
- "legendFormat": "{{`{{`}}path{{`}}`}}",
+ "legendFormat": "pending datapoints",
"refId": "A"
+ },
+ {
+ "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "pending index entries",
+ "refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
- "title": "Requests error rate ($instance)",
+ "title": "Pending datapoints ($instance)",
"tooltip": {
"shared": true,
- "sort": 2,
+ "sort": 0,
"value_type": "individual"
},
"type": "graph",
@@ -1410,7 +1825,8 @@ data:
"show": true
},
{
- "format": "short",
+ "decimals": 3,
+ "format": "none",
"label": null,
"logBase": 1,
"max": null,
@@ -1424,1237 +1840,727 @@ data:
}
},
{
- "collapsed": true,
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
"datasource": "$ds",
+ "description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`",
+ "fieldConfig": {
+ "defaults": {
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
"gridPos": {
- "h": 1,
- "w": 24,
+ "h": 8,
+ "w": 12,
"x": 0,
- "y": 30
+ "y": 43
},
- "id": 14,
- "panels": [
+ "hiddenSeries": false,
+ "id": 53,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "How many datapoints are inserted into storage per second",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 40
- },
- "hiddenSeries": false,
- "id": 10,
- "legend": {
- "alignAsTable": true,
- "avg": true,
- "current": true,
- "hideZero": true,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null as zero",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(vm_rows_inserted_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (type) > 0",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 1,
- "legendFormat": "{{`{{`}}type{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Datapoints ingestion rate ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 2,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "decimals": null,
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Used",
+ "refId": "A"
},
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "Shows the time needed to reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 40
- },
- "hiddenSeries": false,
- "id": 73,
- "legend": {
- "alignAsTable": true,
- "avg": true,
- "current": true,
- "hideZero": true,
- "max": false,
- "min": false,
- "show": false,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null as zero",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"} / ignoring(path) ((rate(vm_rows_added_to_storage_total{job=\"$job\", instance=\"$instance\"}[1d]) - ignoring(type) rate(vm_deduplicated_samples_total{job=\"$job\", instance=\"$instance\", type=\"merge\"}[1d])) * scalar(sum(vm_data_size_bytes{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"})))",
- "format": "time_series",
- "hide": false,
- "interval": "",
- "intervalFactor": 1,
- "legendFormat": "",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Storage full ETA ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 2,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "decimals": null,
- "format": "s",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "Free",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk space usage - datapoints ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
},
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 48
- },
- "hiddenSeries": false,
- "id": 30,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": false,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [
- {
- "alias": "bytes-per-datapoint",
- "yaxis": 2
- }
- ],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 1,
- "legendFormat": "total datapoints",
- "refId": "A"
- },
- {
- "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 1,
- "legendFormat": "bytes-per-datapoint",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Datapoints ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 2,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "decimals": 2,
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "Data parts of LSM tree.\nHigh number of parts could be evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage (hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)",
+ "fieldConfig": {
+ "defaults": {
+ "links": []
},
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 43
+ },
+ "hiddenSeries": false,
+ "id": 36,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*`, since VictoriaMetrics pushes pending data to persistent storage every second.",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 48
- },
- "hiddenSeries": false,
- "id": 34,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": false,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [
- {
- "alias": "pending index entries",
- "yaxis": 2
- }
- ],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"storage\"}",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 1,
- "legendFormat": "pending datapoints",
- "refId": "A"
- },
- {
- "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 1,
- "legendFormat": "pending index entries",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Pending datapoints ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "decimals": 3,
- "format": "none",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "expr": "sum(vm_parts{job=\"$job\", instance=\"$instance\"}) by (type)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{`{{`}}type{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "LSM parts ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
},
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 56
- },
- "hiddenSeries": false,
- "id": 53,
- "legend": {
- "alignAsTable": true,
- "avg": false,
- "current": true,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "total": false,
- "values": true
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 1,
- "legendFormat": "Used",
- "refId": "A"
- },
- {
- "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"}",
- "format": "time_series",
- "interval": "",
- "intervalFactor": 1,
- "legendFormat": "Free",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Disk space usage - datapoints ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "Shows amount of on-disk space occupied by inverted index.",
+ "fieldConfig": {
+ "defaults": {
+ "links": []
},
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 51
+ },
+ "hiddenSeries": false,
+ "id": 55,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 56
- },
- "hiddenSeries": false,
- "id": 36,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(vm_parts{job=\"$job\", instance=\"$instance\"}) by (type)",
- "format": "time_series",
- "intervalFactor": 1,
- "legendFormat": "{{`{{`}}type{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "LSM parts ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 2,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "exemplar": true,
+ "expr": "vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "disk space used",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk space usage - index ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
},
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "Shows amount of on-disk space occupied by inverted index.",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 64
- },
- "hiddenSeries": false,
- "id": 55,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": false,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}",
- "format": "time_series",
- "intervalFactor": 1,
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Disk space usage - index ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "The number of on-going merges in storage nodes. It is expected to have high numbers for `storage/small` metric.",
+ "fieldConfig": {
+ "defaults": {
+ "links": []
},
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 51
+ },
+ "hiddenSeries": false,
+ "id": 62,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "The number of on-going merges in storage nodes. It is expected to have high numbers for `storage/small` metric.",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 64
- },
- "hiddenSeries": false,
- "id": 62,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(vm_active_merges{job=\"$job\", instance=\"$instance\"}) by(type)",
- "legendFormat": "{{`{{`}}type{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Active merges ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "decimals": 0,
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "expr": "sum(vm_active_merges{job=\"$job\", instance=\"$instance\"}) by(type)",
+ "legendFormat": "{{`{{`}}type{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Active merges ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
},
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "Shows the number of bytes read/write from the storage layer.",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 72
- },
- "hiddenSeries": false,
- "id": 76,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": false,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [
- {
- "alias": "read",
- "transform": "negative-Y"
- }
- ],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(process_io_storage_read_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))",
- "format": "time_series",
- "hide": false,
- "interval": "",
- "intervalFactor": 1,
- "legendFormat": "read",
- "refId": "A"
- },
- {
- "expr": "sum(rate(process_io_storage_written_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))",
- "format": "time_series",
- "hide": false,
- "interval": "",
- "intervalFactor": 1,
- "legendFormat": "write",
- "refId": "B"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Disk writes/reads ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "decimals": null,
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": null,
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.",
+ "fieldConfig": {
+ "defaults": {
+ "links": []
},
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 59
+ },
+ "hiddenSeries": false,
+ "id": 58,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "The number of rows merged per second by storage nodes.",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 72
- },
- "hiddenSeries": false,
- "id": 64,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(vm_rows_merged_total{job=\"$job\", instance=\"$instance\"}[5m])) by(type)",
- "legendFormat": "{{`{{`}}type{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Merge speed ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "decimals": 0,
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "exemplar": true,
+ "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason)",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{`{{`}}reason{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Rows ignored ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "The number of rows merged per second by storage nodes.",
+ "fieldConfig": {
+ "defaults": {
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 59
+ },
+ "hiddenSeries": false,
+ "id": 64,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(vm_rows_merged_total{job=\"$job\", instance=\"$instance\"}[5m])) by(type)",
+ "legendFormat": "{{`{{`}}type{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Merge speed ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:867",
+ "decimals": 0,
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
},
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 80
- },
- "hiddenSeries": false,
- "id": 58,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": false,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason) > 0",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 1,
- "legendFormat": "{{`{{`}}reason{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Rows ignored ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "decimals": null,
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "$$hashKey": "object:868",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.",
+ "fieldConfig": {
+ "defaults": {
+ "links": []
},
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 67
+ },
+ "hiddenSeries": false,
+ "id": 67,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
{
- "aliasColors": {},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$ds",
- "description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.",
- "fieldConfig": {
- "defaults": {
- "custom": {},
- "links": []
- },
- "overrides": []
- },
- "fill": 1,
- "fillGradient": 0,
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 80
- },
- "hiddenSeries": false,
- "id": 67,
- "legend": {
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "show": true,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [],
- "nullPointMode": "null",
- "percentage": false,
- "pluginVersion": "7.1.1",
- "pointradius": 2,
- "points": false,
- "renderer": "flot",
- "seriesOverrides": [],
- "spaceLength": 10,
- "stack": false,
- "steppedLine": false,
- "targets": [
- {
- "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=\"$instance\"}[5m])) by (level) ",
- "format": "time_series",
- "hide": false,
- "intervalFactor": 1,
- "legendFormat": "{{`{{`}}level{{`}}`}}",
- "refId": "A"
- }
- ],
- "thresholds": [],
- "timeFrom": null,
- "timeRegions": [],
- "timeShift": null,
- "title": "Logging rate ($instance)",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": []
- },
- "yaxes": [
- {
- "decimals": null,
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- },
- {
- "format": "short",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": "0",
- "show": true
- }
- ],
- "yaxis": {
- "align": false,
- "alignLevel": null
- }
+ "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=\"$instance\"}[5m])) by (level) ",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{`{{`}}level{{`}}`}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Logging rate ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
}
],
- "title": "Storage",
- "type": "row"
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
},
{
"collapsed": true,
@@ -2663,7 +2569,7 @@ data:
"h": 1,
"w": 24,
"x": 0,
- "y": 31
+ "y": 75
},
"id": 71,
"panels": [
@@ -2676,7 +2582,6 @@ data:
"description": "Shows the rate and total number of new series created over last 24h.\n\nHigh churn rate tightly connected with database performance and may result in unexpected OOM's or slow queries. It is recommended to always keep an eye on this metric to avoid unexpected cardinality \"explosions\".\n\nThe higher churn rate is, the more resources required to handle it. Consider to keep the churn rate as low as possible.\n\nGood references to read:\n* https://www.robustperception.io/cardinality-is-key\n* https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2692,19 +2597,25 @@ data:
"hiddenSeries": false,
"id": 66,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
"show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2781,7 +2692,6 @@ data:
"description": "Slow queries rate according to `search.logSlowQueryDuration` flag, which is `5s` by default.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2797,20 +2707,26 @@ data:
"hiddenSeries": false,
"id": 60,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2879,7 +2795,6 @@ data:
"description": "The percentage of slow inserts comparing to total insertion rate during the last 5 minutes. \n\nThe less value is better. If percentage remains high (>50%) during extended periods of time, then it is likely more RAM is needed for optimal handling of the current number of active time series. \n\nIn general, VictoriaMetrics requires ~1KB or RAM per active time series, so it should be easy calculating the required amounts of RAM for the current workload according to capacity planning docs. But the resulting number may be far from the real number because the required amounts of memory depends on may other factors such as the number of labels per time series and the length of label values.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2895,20 +2810,26 @@ data:
"hiddenSeries": false,
"id": 68,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2978,7 +2899,6 @@ data:
"description": "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n\nThis prevents from ingesting metrics with too many labels. The value of `maxLabelsPerTimeseries` must be adjusted for your workload.\n\nWhen limit is exceeded (graph is > 0) - extra labels are dropped, which could result in unexpected identical time series.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2994,20 +2914,24 @@ data:
"hiddenSeries": false,
"id": 74,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
"show": false,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3017,12 +2941,13 @@ data:
"steppedLine": false,
"targets": [
{
+ "exemplar": true,
"expr": "sum(increase(vm_metrics_with_dropped_labels_total{job=\"$job\", instance=\"$instance\"}[5m]))",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
- "legendFormat": "",
+ "legendFormat": "limit exceeded",
"refId": "A"
}
],
@@ -3079,7 +3004,7 @@ data:
"h": 1,
"w": 24,
"x": 0,
- "y": 32
+ "y": 76
},
"id": 46,
"panels": [
@@ -3092,7 +3017,6 @@ data:
"description": "",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3103,25 +3027,31 @@ data:
"h": 8,
"w": 12,
"x": 0,
- "y": 103
+ "y": 29
},
"hiddenSeries": false,
"id": 44,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3162,6 +3092,16 @@ data:
"intervalFactor": 1,
"legendFormat": "resident",
"refId": "D"
+ },
+ {
+ "exemplar": true,
+ "expr": "sum(process_resident_memory_anon_bytes{job=\"$job\", instance=\"$instance\"})",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "resident anonymous",
+ "refId": "E"
}
],
"thresholds": [],
@@ -3213,7 +3153,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3224,25 +3163,31 @@ data:
"h": 8,
"w": 12,
"x": 12,
- "y": 103
+ "y": 29
},
"hiddenSeries": false,
"id": 57,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
"show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3310,7 +3255,6 @@ data:
"description": "Panel shows the number of open file descriptors in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3321,25 +3265,31 @@ data:
"h": 8,
"w": 12,
"x": 0,
- "y": 111
+ "y": 37
},
"hiddenSeries": false,
"id": 75,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3418,10 +3368,9 @@ data:
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
- "description": "Shows avg GC duration",
+ "description": "Shows the number of bytes read/write from the storage layer.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3432,46 +3381,68 @@ data:
"h": 8,
"w": 12,
"x": 12,
- "y": 111
+ "y": 37
},
"hiddenSeries": false,
- "id": 42,
+ "id": 76,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
- "seriesOverrides": [],
+ "seriesOverrides": [
+ {
+ "alias": "read",
+ "transform": "negative-Y"
+ }
+ ],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(go_gc_duration_seconds_sum{job=\"$job\", instance=\"$instance\"}[5m]))\n/\nsum(rate(go_gc_duration_seconds_count{job=\"$job\", instance=\"$instance\"}[5m]))",
+ "expr": "sum(rate(process_io_storage_read_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))",
"format": "time_series",
- "intervalFactor": 2,
- "legendFormat": "avg gc duration",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "read",
"refId": "A"
+ },
+ {
+ "expr": "sum(rate(process_io_storage_written_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "write",
+ "refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
- "title": "GC duration ($instance)",
+ "title": "Disk writes/reads ($instance)",
"tooltip": {
"shared": true,
"sort": 0,
@@ -3487,11 +3458,12 @@ data:
},
"yaxes": [
{
- "format": "s",
+ "decimals": null,
+ "format": "bytes",
"label": null,
"logBase": 1,
"max": null,
- "min": "0",
+ "min": null,
"show": true
},
{
@@ -3516,7 +3488,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3527,25 +3498,31 @@ data:
"h": 8,
"w": 12,
"x": 0,
- "y": 119
+ "y": 45
},
"hiddenSeries": false,
"id": 47,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3610,10 +3587,9 @@ data:
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
- "description": "",
+ "description": "Shows avg GC duration",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3624,25 +3600,31 @@ data:
"h": 8,
"w": 12,
"x": 12,
- "y": 119
+ "y": 45
},
"hiddenSeries": false,
- "id": 37,
+ "id": 42,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3652,11 +3634,10 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(vm_tcplistener_conns{job=\"$job\", instance=\"$instance\"})",
+ "expr": "sum(rate(go_gc_duration_seconds_sum{job=\"$job\", instance=\"$instance\"}[5m]))\n/\nsum(rate(go_gc_duration_seconds_count{job=\"$job\", instance=\"$instance\"}[5m]))",
"format": "time_series",
- "hide": false,
- "intervalFactor": 1,
- "legendFormat": "connections",
+ "intervalFactor": 2,
+ "legendFormat": "avg gc duration",
"refId": "A"
}
],
@@ -3664,7 +3645,7 @@ data:
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
- "title": "TCP connections ($instance)",
+ "title": "GC duration ($instance)",
"tooltip": {
"shared": true,
"sort": 0,
@@ -3680,8 +3661,7 @@ data:
},
"yaxes": [
{
- "decimals": null,
- "format": "short",
+ "format": "s",
"label": null,
"logBase": 1,
"max": null,
@@ -3710,7 +3690,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3721,25 +3700,31 @@ data:
"h": 8,
"w": 12,
"x": 0,
- "y": 127
+ "y": 53
},
"hiddenSeries": false,
"id": 48,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3807,7 +3792,6 @@ data:
"description": "",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3818,25 +3802,134 @@ data:
"h": 8,
"w": 12,
"x": 12,
- "y": 127
+ "y": 53
+ },
+ "hiddenSeries": false,
+ "id": 37,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "8.0.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(vm_tcplistener_conns{job=\"$job\", instance=\"$instance\"})",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "connections",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "TCP connections ($instance)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$ds",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 61
},
"hiddenSeries": false,
"id": 49,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3902,9 +3995,12 @@ data:
}
],
"refresh": "30s",
- "schemaVersion": 26,
+ "schemaVersion": 30,
"style": "dark",
- "tags": [],
+ "tags": [
+ "victoriametrics",
+ "vmsingle"
+ ],
"templating": {
"list": [
{
@@ -3913,6 +4009,8 @@ data:
"text": "VictoriaMetrics",
"value": "VictoriaMetrics"
},
+ "description": null,
+ "error": null,
"hide": 0,
"includeAll": false,
"label": null,
@@ -3931,19 +4029,23 @@ data:
"current": {},
"datasource": "$ds",
"definition": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
+ "description": null,
+ "error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "job",
"options": [],
- "query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
+ "query": {
+ "query": "label_values(vm_app_version{version=~\"victoria-metrics-.*\"}, job)",
+ "refId": "VictoriaMetrics-job-Variable-Query"
+ },
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
- "tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
@@ -3953,19 +4055,23 @@ data:
"current": {},
"datasource": "$ds",
"definition": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)",
+ "description": null,
+ "error": null,
"hide": 2,
"includeAll": false,
"label": null,
"multi": false,
"name": "version",
"options": [],
- "query": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)",
+ "query": {
+ "query": "label_values(vm_app_version{job=\"$job\", instance=\"$instance\"}, version)",
+ "refId": "VictoriaMetrics-version-Variable-Query"
+ },
"refresh": 1,
"regex": "/.*-tags-(v\\d+\\.\\d+\\.\\d+)/",
"skipUrlSync": false,
"sort": 2,
"tagValuesQuery": "",
- "tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
@@ -3975,19 +4081,23 @@ data:
"current": {},
"datasource": "$ds",
"definition": "label_values(vm_app_version{job=~\"$job\"}, instance)",
+ "description": null,
+ "error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "instance",
"options": [],
- "query": "label_values(vm_app_version{job=~\"$job\"}, instance)",
+ "query": {
+ "query": "label_values(vm_app_version{job=~\"$job\"}, instance)",
+ "refId": "VictoriaMetrics-instance-Variable-Query"
+ },
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
- "tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/vmagent.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/vmagent.yaml
index 4ab3d2dbd..f6e188fe4 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/vmagent.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/dashboards/vmagent.yaml
@@ -24,12 +24,12 @@ data:
"type": "grafana",
"id": "grafana",
"name": "Grafana",
- "version": "7.1.1"
+ "version": "8.0.0"
},
{
"type": "panel",
"id": "graph",
- "name": "Graph",
+ "name": "Graph (old)",
"version": ""
},
{
@@ -70,12 +70,12 @@ data:
}
]
},
- "description": "Overview for VictoriaMetrics vmagent v1.56.0 or higher",
+ "description": "Overview for VictoriaMetrics vmagent v1.57.0 or higher",
"editable": true,
"gnetId": null,
"graphTooltip": 1,
"id": null,
- "iteration": 1616957263139,
+ "iteration": 1623414948941,
"links": [
{
"icon": "doc",
@@ -107,6 +107,10 @@ data:
{
"collapsed": false,
"datasource": "$ds",
+ "fieldConfig": {
+ "defaults": {},
+ "overrides": []
+ },
"gridPos": {
"h": 1,
"w": 24,
@@ -123,7 +127,6 @@ data:
"description": "Shows total number of all configured scrape targets in state \"up\".\n\nSee `http://vmagent-host:8429/targets` to get list of all targets. \n",
"fieldConfig": {
"defaults": {
- "custom": {},
"mappings": [],
"thresholds": {
"mode": "absolute",
@@ -156,9 +159,10 @@ data:
"fields": "",
"values": false
},
+ "text": {},
"textMode": "auto"
},
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"targets": [
{
"expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"})",
@@ -177,7 +181,6 @@ data:
"description": "Shows total number of all configured scrape targets in state \"down\".\n\nSee `http://vmagent-host:8429/targets` to get list of all targets. \n",
"fieldConfig": {
"defaults": {
- "custom": {},
"mappings": [],
"thresholds": {
"mode": "absolute",
@@ -220,9 +223,10 @@ data:
"fields": "",
"values": false
},
+ "text": {},
"textMode": "auto"
},
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"targets": [
{
"expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"})",
@@ -241,7 +245,6 @@ data:
"description": "Shows number of generated error messages in logs over last 30m. Non-zero value may be a sign of connectivity or missconfiguration errors.",
"fieldConfig": {
"defaults": {
- "custom": {},
"mappings": [],
"min": 0,
"thresholds": {
@@ -287,9 +290,10 @@ data:
"fields": "",
"values": false
},
+ "text": {},
"textMode": "auto"
},
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"targets": [
{
"expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[30m]))",
@@ -308,7 +312,6 @@ data:
"description": "Persistent queue size shows size of pending samples in bytes which hasn't been flushed to remote storage yet. \nIncreasing of value might be a sign of connectivity issues. In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.",
"fieldConfig": {
"defaults": {
- "custom": {},
"mappings": [],
"thresholds": {
"mode": "absolute",
@@ -346,9 +349,10 @@ data:
"fields": "",
"values": false
},
+ "text": {},
"textMode": "auto"
},
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"targets": [
{
"expr": "sum(vm_persistentqueue_bytes_pending{job=~\"$job\", instance=~\"$instance\"})",
@@ -365,12 +369,6 @@ data:
{
"columns": [],
"datasource": "$ds",
- "fieldConfig": {
- "defaults": {
- "custom": {}
- },
- "overrides": []
- },
"fontSize": "100%",
"gridPos": {
"h": 7,
@@ -383,7 +381,7 @@ data:
"scroll": true,
"showHeader": true,
"sort": {
- "col": null,
+ "col": 3,
"desc": false
},
"styles": [
@@ -467,7 +465,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -501,8 +498,11 @@ data:
"lines": true,
"linewidth": 1,
"nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -572,7 +572,6 @@ data:
"description": "Shows in/out samples rate including push and pull models. \n\nThe out-rate could be different to in-rate because of replication or additional timeseries added by vmagent for every scraped target.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -588,19 +587,25 @@ data:
"hiddenSeries": false,
"id": 5,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -678,7 +683,6 @@ data:
"description": "Shows the rate of requests served by vmagent HTTP server.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -694,20 +698,26 @@ data:
"hiddenSeries": false,
"id": 15,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -773,7 +783,6 @@ data:
"description": "Network usage shows the bytes rate for data accepted by vmagent and pushed via remotewrite protocol.\nDiscrepancies are possible because of different protocols used for ingesting, scraping and writing data.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -789,19 +798,26 @@ data:
"hiddenSeries": false,
"id": 7,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -878,7 +894,6 @@ data:
"description": "Errors rate shows rate for multiple metrics that track possible errors in vmagent, such as network or parsing errors.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -894,13 +909,16 @@ data:
"hiddenSeries": false,
"id": 69,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
@@ -912,8 +930,11 @@ data:
}
],
"nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1003,7 +1024,6 @@ data:
"description": "Shows rate of dropped samples from persistent queue. VMagent drops samples from queue if in-memory and on-disk queues are full and it is unable to flush them to remote storage.\nThe max size of on-disk queue is configured by `-remoteWrite.maxDiskUsagePerURL` flag.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1019,13 +1039,16 @@ data:
"hiddenSeries": false,
"id": 49,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
@@ -1037,8 +1060,11 @@ data:
}
],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1104,7 +1130,6 @@ data:
"description": "Shows the persistent queue size of pending samples in bytes which hasn't been flushed to remote storage yet. \n\nIncreasing of value might be a sign of connectivity issues. In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1120,13 +1145,16 @@ data:
"hiddenSeries": false,
"id": 17,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
@@ -1137,8 +1165,11 @@ data:
}
],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1204,7 +1235,6 @@ data:
"description": "Shows the rate of dropped samples due to relabeling. \nMetric tracks drops for `-remoteWrite.relabelConfig` configuration only.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1220,13 +1250,16 @@ data:
"hiddenSeries": false,
"id": 18,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
@@ -1238,8 +1271,11 @@ data:
}
],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1311,7 +1347,6 @@ data:
"description": "Shows the rate of dropped data blocks in cases when remote storage replies with `400 Bad Request` and `409 Conflict` HTTP responses.\n\nSee https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1327,20 +1362,26 @@ data:
"hiddenSeries": false,
"id": 79,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1350,9 +1391,10 @@ data:
"steppedLine": false,
"targets": [
{
+ "exemplar": true,
"expr": "sum(rate(vmagent_remotewrite_packets_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval]))",
"interval": "",
- "legendFormat": "",
+ "legendFormat": "dropped",
"refId": "A"
}
],
@@ -1400,6 +1442,10 @@ data:
{
"collapsed": true,
"datasource": "$ds",
+ "fieldConfig": {
+ "defaults": {},
+ "overrides": []
+ },
"gridPos": {
"h": 1,
"w": 24,
@@ -1416,7 +1462,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1432,19 +1477,25 @@ data:
"hiddenSeries": false,
"id": 48,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1454,7 +1505,8 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"}) by(type)",
+ "exemplar": true,
+ "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"}) by(type) > 0",
"format": "time_series",
"interval": "",
"legendFormat": "{{`{{`}}type{{`}}`}}",
@@ -1471,7 +1523,6 @@ data:
"sort": 2,
"value_type": "individual"
},
- "transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
@@ -1511,7 +1562,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1527,19 +1577,25 @@ data:
"hiddenSeries": false,
"id": 76,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1549,6 +1605,7 @@ data:
"steppedLine": false,
"targets": [
{
+ "exemplar": true,
"expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"}) by(type) > 0",
"format": "time_series",
"interval": "",
@@ -1566,7 +1623,6 @@ data:
"sort": 2,
"value_type": "individual"
},
- "transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
@@ -1606,7 +1662,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1622,19 +1677,25 @@ data:
"hiddenSeries": false,
"id": 20,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1710,7 +1771,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1726,19 +1786,25 @@ data:
"hiddenSeries": false,
"id": 31,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1821,7 +1887,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -1837,19 +1902,25 @@ data:
"hiddenSeries": false,
"id": 46,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
"show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -1928,12 +1999,6 @@ data:
"dataFormat": "tsbuckets",
"datasource": "$ds",
"description": "works in vm only disclaimer",
- "fieldConfig": {
- "defaults": {
- "custom": {}
- },
- "overrides": []
- },
"gridPos": {
"h": 8,
"w": 12,
@@ -1991,6 +2056,10 @@ data:
{
"collapsed": true,
"datasource": "$ds",
+ "fieldConfig": {
+ "defaults": {},
+ "overrides": []
+ },
"gridPos": {
"h": 1,
"w": 24,
@@ -2008,7 +2077,6 @@ data:
"description": "Shows the rate of write requests served by ingestserver (UDP, TCP connections) and HTTP server.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2024,20 +2092,26 @@ data:
"hiddenSeries": false,
"id": 73,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2047,13 +2121,15 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(vm_ingestserver_requests_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net)",
+ "exemplar": true,
+ "expr": "sum(rate(vm_ingestserver_requests_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net) > 0",
"interval": "",
"legendFormat": "{{`{{`}} type {{`}}`}} ({{`{{`}}net{{`}}`}})",
"refId": "A"
},
{
- "expr": "sum(rate(vmagent_http_requests_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol)",
+ "exemplar": true,
+ "expr": "sum(rate(vmagent_http_requests_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol) > 0",
"interval": "",
"legendFormat": "{{`{{`}} protocol {{`}}`}} (http)",
"refId": "B"
@@ -2109,7 +2185,6 @@ data:
"description": "Shows the rate of write errors in ingestserver (UDP, TCP connections) and HTTP server.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2125,20 +2200,26 @@ data:
"hiddenSeries": false,
"id": 77,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2148,13 +2229,15 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(vm_ingestserver_request_errors_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net)",
+ "exemplar": true,
+ "expr": "sum(rate(vm_ingestserver_request_errors_total{job=~\"$job\", instance=~\"$instance\", path!~\"/favicon.ico\"}[$__interval])) by(type, net) > 0",
"interval": "",
"legendFormat": "{{`{{`}} type {{`}}`}} ({{`{{`}}net{{`}}`}})",
"refId": "A"
},
{
- "expr": "sum(rate(vmagent_http_request_errors_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol)",
+ "exemplar": true,
+ "expr": "sum(rate(vmagent_http_request_errors_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__interval])) by(protocol) > 0",
"interval": "",
"legendFormat": "{{`{{`}} protocol {{`}}`}} (http)",
"refId": "B"
@@ -2210,7 +2293,6 @@ data:
"description": "Shows the rate of parsed rows from write or scrape requests.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2226,20 +2308,26 @@ data:
"hiddenSeries": false,
"id": 78,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2249,9 +2337,10 @@ data:
"steppedLine": false,
"targets": [
{
+ "exemplar": true,
"expr": "sum(rate(vm_protoparser_rows_read_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(type)",
"interval": "",
- "legendFormat": "{{`{{`}} type {{`}}`}} ({{`{{`}}net{{`}}`}})",
+ "legendFormat": "{{`{{`}} type {{`}}`}}",
"refId": "A"
}
],
@@ -2305,7 +2394,6 @@ data:
"description": "Tracks the rate of dropped invalid rows because of errors while unmarshaling write requests. The exact errors messages will be printed in logs.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2321,19 +2409,25 @@ data:
"hiddenSeries": false,
"id": 50,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2343,7 +2437,8 @@ data:
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(vm_rows_invalid_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(type)",
+ "exemplar": true,
+ "expr": "sum(rate(vm_rows_invalid_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(type) > 0",
"interval": "",
"legendFormat": "{{`{{`}}type{{`}}`}}",
"refId": "A"
@@ -2397,6 +2492,10 @@ data:
{
"collapsed": true,
"datasource": "$ds",
+ "fieldConfig": {
+ "defaults": {},
+ "overrides": []
+ },
"gridPos": {
"h": 1,
"w": 24,
@@ -2414,7 +2513,6 @@ data:
"description": "Shows the rate of requests to configured remote write endpoints by url and status code.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.\n\n",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2430,19 +2528,25 @@ data:
"hiddenSeries": false,
"id": 60,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2509,7 +2613,6 @@ data:
"description": "Shows the global rate for number of written bytes via remote write connections.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2525,19 +2628,25 @@ data:
"hiddenSeries": false,
"id": 66,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2603,7 +2712,6 @@ data:
"description": "Shows requests retry rate by url. Number of retries is unlimited but protected with delays up to 1m between attempts.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.\n\n",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2619,19 +2727,25 @@ data:
"hiddenSeries": false,
"id": 61,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2641,9 +2755,10 @@ data:
"steppedLine": false,
"targets": [
{
+ "exemplar": true,
"expr": "sum(rate(vmagent_remotewrite_retries_count_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(url)",
"interval": "",
- "legendFormat": "{{`{{`}} url {{`}}`}}",
+ "legendFormat": "",
"refId": "A"
}
],
@@ -2697,7 +2812,6 @@ data:
"description": "Shows current number of established connections to remote write endpoints.\n\n",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -2713,19 +2827,25 @@ data:
"hiddenSeries": false,
"id": 65,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -2797,12 +2917,6 @@ data:
"dataFormat": "tsbuckets",
"datasource": "$ds",
"description": "Shows the remote write request block size distribution in rows.",
- "fieldConfig": {
- "defaults": {
- "custom": {}
- },
- "overrides": []
- },
"gridPos": {
"h": 8,
"w": 12,
@@ -2868,12 +2982,6 @@ data:
"dataFormat": "tsbuckets",
"datasource": "$ds",
"description": "Shows the remote write request block size distribution in bytes.",
- "fieldConfig": {
- "defaults": {
- "custom": {}
- },
- "overrides": []
- },
"gridPos": {
"h": 8,
"w": 12,
@@ -2939,12 +3047,6 @@ data:
"dataFormat": "tsbuckets",
"datasource": "$ds",
"description": "Shows the remote write request duration distribution in seconds. Value depends on block size, network quality and remote storage performance.",
- "fieldConfig": {
- "defaults": {
- "custom": {}
- },
- "overrides": []
- },
"gridPos": {
"h": 8,
"w": 24,
@@ -3002,6 +3104,10 @@ data:
{
"collapsed": true,
"datasource": "$ds",
+ "fieldConfig": {
+ "defaults": {},
+ "overrides": []
+ },
"gridPos": {
"h": 1,
"w": 24,
@@ -3019,7 +3125,6 @@ data:
"description": "Shows the CPU usage per vmagent instance. \nIf you think that usage is abnormal or unexpected pls file an issue and attach CPU profile if possible.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3035,13 +3140,16 @@ data:
"hiddenSeries": false,
"id": 35,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
"show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
@@ -3053,8 +3161,11 @@ data:
}
],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3118,10 +3229,9 @@ data:
"dashLength": 10,
"dashes": false,
"datasource": "$ds",
- "description": "Amount of used memory (resident)\n\nIf you think that usage is abnormal or unexpected pls file an issue and attach memory profile if possible.",
+ "description": "Amount of used memory (resident and anonymous)\n\nIf you think that usage is abnormal or unexpected, please file an issue and attach a memory profile if possible.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3137,13 +3247,16 @@ data:
"hiddenSeries": false,
"id": 37,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
@@ -3155,8 +3268,11 @@ data:
}
],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3166,10 +3282,19 @@ data:
"steppedLine": false,
"targets": [
{
+ "exemplar": true,
"expr": "sum(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}) by (instance)",
"interval": "",
- "legendFormat": "{{`{{`}}instance{{`}}`}}",
+ "legendFormat": "resident {{`{{`}}instance{{`}}`}}",
"refId": "A"
+ },
+ {
+ "exemplar": true,
+ "expr": "sum(process_resident_memory_anon_bytes{job=~\"$job\", instance=~\"$instance\"}) by (instance)",
+ "hide": false,
+ "interval": "",
+ "legendFormat": "anonymous {{`{{`}}instance{{`}}`}}",
+ "refId": "B"
}
],
"thresholds": [],
@@ -3222,7 +3347,6 @@ data:
"description": "Panel shows the number of open file descriptors in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3238,20 +3362,26 @@ data:
"hiddenSeries": false,
"id": 83,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3332,7 +3462,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3348,20 +3477,26 @@ data:
"hiddenSeries": false,
"id": 39,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3430,7 +3565,6 @@ data:
"description": "Shows the number of bytes read/write from the storage layer when vmagent has to buffer data on disk or read already buffered data.",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3446,20 +3580,26 @@ data:
"hiddenSeries": false,
"id": 81,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3542,7 +3682,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3558,20 +3697,26 @@ data:
"hiddenSeries": false,
"id": 41,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3638,7 +3783,6 @@ data:
"datasource": "$ds",
"fieldConfig": {
"defaults": {
- "custom": {},
"links": []
},
"overrides": []
@@ -3654,20 +3798,26 @@ data:
"hiddenSeries": false,
"id": 43,
"legend": {
- "avg": false,
- "current": false,
- "max": false,
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
"min": false,
- "show": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
"total": false,
- "values": false
+ "values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
"percentage": false,
- "pluginVersion": "7.1.1",
+ "pluginVersion": "8.0.0",
"pointradius": 2,
"points": false,
"renderer": "flot",
@@ -3731,7 +3881,7 @@ data:
}
],
"refresh": false,
- "schemaVersion": 26,
+ "schemaVersion": 30,
"style": "dark",
"tags": [
"vmagent",
@@ -3745,6 +3895,8 @@ data:
"text": "VictoriaMetrics",
"value": "VictoriaMetrics"
},
+ "description": null,
+ "error": null,
"hide": 0,
"includeAll": false,
"label": null,
@@ -3763,19 +3915,23 @@ data:
"current": {},
"datasource": "$ds",
"definition": "label_values(vm_app_version{version=~\"^vmagent.*\"}, job)",
+ "description": null,
+ "error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": true,
"name": "job",
"options": [],
- "query": "label_values(vm_app_version{version=~\"^vmagent.*\"}, job)",
+ "query": {
+ "query": "label_values(vm_app_version{version=~\"^vmagent.*\"}, job)",
+ "refId": "VictoriaMetrics-job-Variable-Query"
+ },
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
- "tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
@@ -3785,19 +3941,23 @@ data:
"current": {},
"datasource": "$ds",
"definition": "label_values(vm_app_version{job=~\"$job\"}, instance)",
+ "description": null,
+ "error": null,
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "instance",
"options": [],
- "query": "label_values(vm_app_version{job=~\"$job\"}, instance)",
+ "query": {
+ "query": "label_values(vm_app_version{job=~\"$job\"}, instance)",
+ "refId": "VictoriaMetrics-instance-Variable-Query"
+ },
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
- "tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
diff --git a/charts/victoria-metrics-k8s-stack/templates/grafana/datasource.yaml b/charts/victoria-metrics-k8s-stack/templates/grafana/datasource.yaml
index 96560801c..039eba30d 100644
--- a/charts/victoria-metrics-k8s-stack/templates/grafana/datasource.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/grafana/datasource.yaml
@@ -13,13 +13,13 @@ metadata:
app: {{ include "victoria-metrics-k8s-stack.name" $ }}-grafana
{{- include "victoria-metrics-k8s-stack.labels" . | nindent 4 }}
data:
-{{- if .Values.vmsingle.enabled }}
+{{- if or .Values.vmsingle.enabled .Values.vmcluster.enabled }}
datasource.yaml: |-
apiVersion: 1
datasources:
- name: VictoriaMetrics
type: prometheus
- {{ include "victoria-metrics-k8s-stack.vmEndpoint" . }}
+ url: {{ include "victoria-metrics-k8s-stack.vmSelectEndpoint" . }}
access: proxy
isDefault: true
{{- end }}
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/general.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/general.rules.yaml
index b0d121f9f..d85009f21 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/general.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/general.rules.yaml
@@ -26,7 +26,7 @@ spec:
- alert: TargetDown
annotations:
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-targetdown
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
for: 10m
@@ -48,7 +48,7 @@ spec:
"DeadMansSnitch" integration in PagerDuty.
'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-watchdog
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/k8s.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/k8s.rules.yaml
index 4928d8710..174b35c3a 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/k8s.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/k8s.rules.yaml
@@ -25,11 +25,11 @@ spec:
rules:
- expr: |-
sum by (cluster, namespace, pod, container) (
- rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
+ irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
- record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
+ record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
- expr: |-
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
@@ -54,6 +54,12 @@ spec:
max by(namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_swap
+    - expr: |-  # phase filter keeps `cluster` so it can match on(namespace, pod, cluster)
+        kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
+        group_left() max by (namespace, pod, cluster) (
+          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+        )
+      record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
@@ -65,6 +71,12 @@ spec:
)
)
record: namespace_memory:kube_pod_container_resource_requests:sum
+    - expr: |-  # phase filter keeps `cluster` so it can match on(namespace, pod, cluster)
+        kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
+        group_left() max by (namespace, pod, cluster) (
+          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+        )
+      record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
@@ -76,6 +88,40 @@ spec:
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
+    - expr: |-  # phase filter keeps `cluster` so it can match on(namespace, pod, cluster)
+        kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
+        group_left() max by (namespace, pod, cluster) (
+          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+        )
+      record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
+    - expr: |-  # phase filter keeps `cluster` so it can match on(namespace, pod, cluster)
+        sum by (namespace, cluster) (
+            sum by (namespace, pod, cluster) (
+                max by (namespace, pod, container, cluster) (
+                  kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
+                ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                  kube_pod_status_phase{phase=~"Pending|Running"} == 1
+                )
+            )
+        )
+      record: namespace_memory:kube_pod_container_resource_limits:sum
+    - expr: |-  # phase filter keeps `cluster` so it can match on(namespace, pod, cluster)
+        kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
+        group_left() max by (namespace, pod, cluster) (
+          (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
+        )
+      record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
+    - expr: |-  # phase filter keeps `cluster` so it can match on(namespace, pod, cluster)
+        sum by (namespace, cluster) (
+            sum by (namespace, pod, cluster) (
+                max by (namespace, pod, container, cluster) (
+                  kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
+                ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
+                  kube_pod_status_phase{phase=~"Pending|Running"} == 1
+                )
+            )
+        )
+      record: namespace_cpu:kube_pod_container_resource_limits:sum
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-availability.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-availability.rules.yaml
index fc8c9f4c8..a006cb9e9 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-availability.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-availability.rules.yaml
@@ -24,6 +24,16 @@ spec:
- interval: 3m
name: kube-apiserver-availability.rules
rules:
+ - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
+ record: code_verb:apiserver_request_total:increase30d
+ - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
+ labels:
+ verb: read
+ record: code:apiserver_request_total:increase30d
+ - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
+ labels:
+ verb: write
+ record: code:apiserver_request_total:increase30d
- expr: |-
1 - (
(
@@ -38,14 +48,14 @@ spec:
-
(
(
- sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
+ sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="1"}[30d]))
or
vector(0)
)
+
- sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
+ sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="5"}[30d]))
+
- sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
+ sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="40"}[30d]))
)
) +
# errors
@@ -63,14 +73,14 @@ spec:
(
# too slow
(
- sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
+ sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30d]))
or
vector(0)
)
+
- sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
+ sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30d]))
+
- sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
+ sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30d]))
)
+
# errors
@@ -98,62 +108,20 @@ spec:
labels:
verb: write
record: apiserver_request:availability30d
- - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
- record: code_verb:apiserver_request_total:increase30d
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h]))
- record: code_verb:apiserver_request_total:increase1h
- - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
+ - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
- record: code:apiserver_request_total:increase30d
- - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
+ record: code_resource:apiserver_request_total:rate5m
+ - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
- record: code:apiserver_request_total:increase30d
+ record: code_resource:apiserver_request_total:rate5m
+ - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
+ record: code_verb:apiserver_request_total:increase1h
+ - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
+ record: code_verb:apiserver_request_total:increase1h
+ - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
+ record: code_verb:apiserver_request_total:increase1h
+ - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
+ record: code_verb:apiserver_request_total:increase1h
{{- end }}
\ No newline at end of file
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-burnrate.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-burnrate.rules.yaml
new file mode 100644
index 000000000..7950b818b
--- /dev/null
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-burnrate.rules.yaml
@@ -0,0 +1,327 @@
+{{- /*
+Generated from 'kube-apiserver-burnrate.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack
+*/ -}}
+{{- if and .Values.defaultRules.create }}
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMRule
+metadata:
+ namespace: {{ .Release.Namespace }}
+ name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) "kube-apiserver-burnrate.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ include "victoria-metrics-k8s-stack.name" $ }}
+{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-apiserver-burnrate.rules
+ rules:
+ - expr: |-
+ (
+ (
+ # too slow
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
+ -
+ (
+ (
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1d]))
+ or
+ vector(0)
+ )
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1d]))
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1d]))
+ )
+ )
+ +
+ # errors
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
+ )
+ /
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate1d
+ - expr: |-
+ (
+ (
+ # too slow
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
+ -
+ (
+ (
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[1h]))
+ or
+ vector(0)
+ )
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[1h]))
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[1h]))
+ )
+ )
+ +
+ # errors
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
+ )
+ /
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate1h
+ - expr: |-
+ (
+ (
+ # too slow
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
+ -
+ (
+ (
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[2h]))
+ or
+ vector(0)
+ )
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[2h]))
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[2h]))
+ )
+ )
+ +
+ # errors
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
+ )
+ /
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate2h
+ - expr: |-
+ (
+ (
+ # too slow
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
+ -
+ (
+ (
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[30m]))
+ or
+ vector(0)
+ )
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[30m]))
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[30m]))
+ )
+ )
+ +
+ # errors
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
+ )
+ /
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate30m
+ - expr: |-
+ (
+ (
+ # too slow
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
+ -
+ (
+ (
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[3d]))
+ or
+ vector(0)
+ )
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[3d]))
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[3d]))
+ )
+ )
+ +
+ # errors
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
+ )
+ /
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate3d
+ - expr: |-
+ (
+ (
+ # too slow
+ sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
+ -
+ (
+ (
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[5m]))
+ or
+ vector(0)
+ )
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[5m]))
+ +
+ sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[5m]))
+ )
+ )
+ +
+ # errors
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
+ )
+ /
+ sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate5m
+    - expr: |-
+        (
+          (
+            # too slow
+            sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
+            -
+            (
+              (
+                sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="1"}[6h]))
+                or
+                vector(0)
+              )
+              +
+              sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="5"}[6h]))
+              +
+              sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="40"}[6h]))
+            )
+          )
+          +
+          # errors
+          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
+        )
+        /
+        sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
+      labels:  # static labels attached to the recorded series
+        verb: read
+      record: apiserver_request:burnrate6h  # per-cluster share of LIST/GET requests over 6h that were too slow for their scope (le 1 resource, 5 namespace, 40 cluster) or returned 5xx
+    - expr: |-
+        (
+          (
+            # too slow
+            sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
+            -
+            sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
+          )
+          +
+          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
+        )
+        /
+        sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
+      labels:  # static labels attached to the recorded series
+        verb: write
+      record: apiserver_request:burnrate1d  # per-cluster share of POST/PUT/PATCH/DELETE requests over 1d that fell outside the le="1" bucket or returned 5xx
+    - expr: |-
+        (
+          (
+            # too slow
+            sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
+            -
+            sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
+          )
+          +
+          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
+        )
+        /
+        sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
+      labels:  # static labels attached to the recorded series
+        verb: write
+      record: apiserver_request:burnrate1h  # per-cluster share of POST/PUT/PATCH/DELETE requests over 1h that fell outside the le="1" bucket or returned 5xx
+    - expr: |-
+        (
+          (
+            # too slow
+            sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
+            -
+            sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
+          )
+          +
+          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
+        )
+        /
+        sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
+      labels:  # static labels attached to the recorded series
+        verb: write
+      record: apiserver_request:burnrate2h  # per-cluster share of POST/PUT/PATCH/DELETE requests over 2h that fell outside the le="1" bucket or returned 5xx
+    - expr: |-
+        (
+          (
+            # too slow
+            sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
+            -
+            sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
+          )
+          +
+          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
+        )
+        /
+        sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
+      labels:  # static labels attached to the recorded series
+        verb: write
+      record: apiserver_request:burnrate30m  # per-cluster share of POST/PUT/PATCH/DELETE requests over 30m that fell outside the le="1" bucket or returned 5xx
+    - expr: |-
+        (
+          (
+            # too slow
+            sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
+            -
+            sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
+          )
+          +
+          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
+        )
+        /
+        sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
+      labels:  # static labels attached to the recorded series
+        verb: write
+      record: apiserver_request:burnrate3d  # per-cluster share of POST/PUT/PATCH/DELETE requests over 3d that fell outside the le="1" bucket or returned 5xx
+    - expr: |-
+        (
+          (
+            # too slow
+            sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
+            -
+            sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
+          )
+          +
+          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
+        )
+        /
+        sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
+      labels:  # static labels attached to the recorded series
+        verb: write
+      record: apiserver_request:burnrate5m  # per-cluster share of POST/PUT/PATCH/DELETE requests over 5m that fell outside the le="1" bucket or returned 5xx
+    - expr: |-
+        (
+          (
+            # too slow
+            sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
+            -
+            sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
+          )
+          +
+          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
+        )
+        /
+        sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
+      labels:  # static labels attached to the recorded series
+        verb: write
+      record: apiserver_request:burnrate6h  # per-cluster share of POST/PUT/PATCH/DELETE requests over 6h that fell outside the le="1" bucket or returned 5xx
+{{- end }}
\ No newline at end of file
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-histogram.rules.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-histogram.rules.yaml
new file mode 100644
index 000000000..85ddf2f0c
--- /dev/null
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-histogram.rules.yaml
@@ -0,0 +1,48 @@
+{{- /*
+Generated from 'kube-apiserver-histogram.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack
+*/ -}}
+{{- if and .Values.defaultRules.create }}
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMRule
+metadata:
+  namespace: {{ .Release.Namespace }}
+  name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) "kube-apiserver-histogram.rules" | trunc 63 | trimSuffix "-" }}
+  labels:  # chart labels first, then optional user-supplied defaultRules.labels
+    app: {{ include "victoria-metrics-k8s-stack.name" $ }}
+{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+  annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+  groups:
+  - name: kube-apiserver-histogram.rules
+    rules:  # all rules below record the same series name and are told apart by the quantile/verb labels
+    - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
+      labels:
+        quantile: '0.99'
+        verb: read
+      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile  # 0.99-quantile of LIST/GET request duration per cluster and resource (5m rate); only values > 0 are kept
+    - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
+      labels:
+        quantile: '0.99'
+        verb: write
+      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile  # 0.99-quantile of POST/PUT/PATCH/DELETE request duration per cluster and resource (5m rate); only values > 0 are kept
+    - expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
+      labels:
+        quantile: '0.99'
+      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile  # 0.99-quantile of request duration, aggregated over instance and pod, excluding the log subresource and the verbs listed in the selector
+    - expr: histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
+      labels:
+        quantile: '0.9'
+      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile  # 0.9-quantile, same selection as the rule above
+    - expr: histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
+      labels:
+        quantile: '0.5'
+      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile  # 0.5-quantile (median), same selection as the rule above
+{{- end }}
\ No newline at end of file
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-slos.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-slos.yaml
index d85d1c187..f7a4dde76 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-slos.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-slos.yaml
@@ -26,7 +26,7 @@ spec:
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |-
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
@@ -43,7 +43,7 @@ spec:
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |-
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
@@ -60,7 +60,7 @@ spec:
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |-
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
@@ -77,7 +77,7 @@ spec:
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |-
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kube-state-metrics.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kube-state-metrics.yaml
index 1ee495ea0..d8fe76402 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kube-state-metrics.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kube-state-metrics.yaml
@@ -26,7 +26,7 @@ spec:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricslisterrors
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |-
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
@@ -42,7 +42,7 @@ spec:
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricswatcherrors
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |-
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
@@ -58,7 +58,7 @@ spec:
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardingmismatch
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
for: 15m
@@ -70,7 +70,7 @@ spec:
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardsmissing
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-apps.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-apps.yaml
index 3517d07e5..4e9d6aedf 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-apps.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-apps.yaml
@@ -27,9 +27,12 @@ spec:
- alert: KubePodCrashLooping
annotations:
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping
summary: Pod is crash looping.
- expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) * 60 * 5 > 0
+ expr: |-
+ increase(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) > 0
+ and
+ kube_pod_container_status_waiting{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} == 1
for: 15m
labels:
severity: warning
@@ -39,7 +42,7 @@ spec:
- alert: KubePodNotReady
annotations:
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
sum by (namespace, pod) (
@@ -58,7 +61,7 @@ spec:
- alert: KubeDeploymentGenerationMismatch
annotations:
description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back
expr: |-
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
@@ -73,12 +76,12 @@ spec:
- alert: KubeDeploymentReplicasMismatch
annotations:
description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |-
(
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
- !=
+ >
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
) and (
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m])
@@ -94,7 +97,7 @@ spec:
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |-
(
@@ -115,7 +118,7 @@ spec:
- alert: KubeStatefulSetGenerationMismatch
annotations:
description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |-
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
@@ -130,7 +133,7 @@ spec:
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: |-
(
@@ -159,7 +162,7 @@ spec:
- alert: KubeDaemonSetRolloutStuck
annotations:
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: |-
(
@@ -194,7 +197,7 @@ spec:
- alert: KubeContainerWaiting
annotations:
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontainerwaiting
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0
for: 1h
@@ -206,7 +209,7 @@ spec:
- alert: KubeDaemonSetNotScheduled
annotations:
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: |-
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
@@ -221,7 +224,7 @@ spec:
- alert: KubeDaemonSetMisScheduled
annotations:
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
for: 15m
@@ -233,7 +236,7 @@ spec:
- alert: KubeJobCompletion
annotations:
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than 12 hours to complete.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobcompletion
summary: Job did not complete in time
expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
for: 12h
@@ -245,7 +248,7 @@ spec:
- alert: KubeJobFailed
annotations:
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed
summary: Job failed to complete.
expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
for: 15m
@@ -256,23 +259,23 @@ spec:
{{- end }}
- alert: KubeHpaReplicasMismatch
annotations:
- description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch
+ description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch
summary: HPA has not matched descired number of replicas.
expr: |-
- (kube_hpa_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
!=
- kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
+ kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
and
- (kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
>
- kube_hpa_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
+ kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
and
- (kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
<
- kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
+ kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
and
- changes(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
+ changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
for: 15m
labels:
severity: warning
@@ -281,13 +284,13 @@ spec:
{{- end }}
- alert: KubeHpaMaxedOut
annotations:
- description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has been running at max replicas for longer than 15 minutes.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout
+ description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout
summary: HPA is running at max replicas
expr: |-
- kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
==
- kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
for: 15m
labels:
severity: warning
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-resources.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-resources.yaml
index 1158e8762..165209a67 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-resources.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-resources.yaml
@@ -26,7 +26,7 @@ spec:
- alert: KubeCPUOvercommit
annotations:
description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{})
@@ -43,7 +43,7 @@ spec:
- alert: KubeMemoryOvercommit
annotations:
description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryovercommit
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |-
sum(namespace_memory:kube_pod_container_resource_requests:sum{})
@@ -62,7 +62,7 @@ spec:
- alert: KubeCPUQuotaOvercommit
annotations:
description: Cluster has overcommitted CPU resource requests for Namespaces.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuquotaovercommit
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |-
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
@@ -78,7 +78,7 @@ spec:
- alert: KubeMemoryQuotaOvercommit
annotations:
description: Cluster has overcommitted memory resource requests for Namespaces.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryquotaovercommit
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |-
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
@@ -94,7 +94,7 @@ spec:
- alert: KubeQuotaAlmostFull
annotations:
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaalmostfull
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull
summary: Namespace quota is going to be full.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -110,7 +110,7 @@ spec:
- alert: KubeQuotaFullyUsed
annotations:
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotafullyused
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused
summary: Namespace quota is fully used.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -126,7 +126,7 @@ spec:
- alert: KubeQuotaExceeded
annotations:
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded
summary: Namespace quota has exceeded the limits.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
@@ -142,7 +142,7 @@ spec:
- alert: CPUThrottlingHigh
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: |-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-storage.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-storage.yaml
index 0eb9141ca..56217031d 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-storage.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-storage.yaml
@@ -27,13 +27,16 @@ spec:
- alert: KubePersistentVolumeFillingUp
annotations:
description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |-
- kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
- /
- kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
- < 0.03
+ (
+ kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
+ /
+ kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
+ ) < 0.03
+ and
+ kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
for: 1m
labels:
severity: critical
@@ -43,7 +46,7 @@ spec:
- alert: KubePersistentVolumeFillingUp
annotations:
description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |-
(
@@ -52,6 +55,8 @@ spec:
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
) < 0.15
and
+ kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
+ and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
for: 1h
labels:
@@ -62,7 +67,7 @@ spec:
- alert: KubePersistentVolumeErrors
annotations:
description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors
summary: PersistentVolume is having issues with provisioning.
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
for: 5m
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-apiserver.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-apiserver.yaml
index 3d84cdb5a..f3726e8d6 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-apiserver.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-apiserver.yaml
@@ -26,7 +26,7 @@ spec:
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels:
@@ -37,7 +37,7 @@ spec:
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels:
@@ -48,7 +48,7 @@ spec:
- alert: AggregatedAPIErrors
annotations:
description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapierrors
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/aggregatedapierrors
summary: An aggregated API has reported errors.
expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
@@ -59,7 +59,7 @@ spec:
- alert: AggregatedAPIDown
annotations:
description: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapidown
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/aggregatedapidown
summary: An aggregated API is down.
expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
@@ -72,7 +72,7 @@ spec:
- alert: KubeAPIDown
annotations:
description: KubeAPI has disappeared from Prometheus target discovery.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="apiserver"} == 1)
for: 15m
@@ -85,7 +85,7 @@ spec:
- alert: KubeAPITerminatedRequests
annotations:
description: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapiterminatedrequests
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests
summary: The apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-controller-manager.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-controller-manager.yaml
index 7c7d108d8..56c95826a 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-controller-manager.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-controller-manager.yaml
@@ -27,7 +27,7 @@ spec:
- alert: KubeControllerManagerDown
annotations:
description: KubeControllerManager has disappeared from Prometheus target discovery.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kube-controller-manager"} == 1)
for: 15m
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-kubelet.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-kubelet.yaml
index a4443a079..4b324efed 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-kubelet.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-kubelet.yaml
@@ -26,7 +26,7 @@ spec:
- alert: KubeNodeNotReady
annotations:
description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready
summary: Node is not ready.
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 15m
@@ -38,7 +38,7 @@ spec:
- alert: KubeNodeUnreachable
annotations:
description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodeunreachable
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable
summary: Node is unreachable.
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
for: 15m
@@ -50,7 +50,7 @@ spec:
- alert: KubeletTooManyPods
annotations:
description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods
summary: Kubelet is running at capacity.
expr: |-
count by(node) (
@@ -69,7 +69,7 @@ spec:
- alert: KubeNodeReadinessFlapping
annotations:
description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodereadinessflapping
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
for: 15m
@@ -81,7 +81,7 @@ spec:
- alert: KubeletPlegDurationHigh
annotations:
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletplegdurationhigh
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
@@ -93,7 +93,7 @@ spec:
- alert: KubeletPodStartUpLatencyHigh
annotations:
description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletpodstartuplatencyhigh
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
for: 15m
@@ -105,7 +105,7 @@ spec:
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificateexpiration
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
@@ -116,7 +116,7 @@ spec:
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificateexpiration
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
@@ -127,7 +127,7 @@ spec:
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificateexpiration
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
@@ -138,7 +138,7 @@ spec:
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificateexpiration
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
@@ -149,7 +149,7 @@ spec:
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletclientcertificaterenewalerrors
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors
summary: Kubelet has failed to renew its client certificate.
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
for: 15m
@@ -161,7 +161,7 @@ spec:
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletservercertificaterenewalerrors
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
@@ -174,7 +174,7 @@ spec:
- alert: KubeletDown
annotations:
description: Kubelet has disappeared from Prometheus target discovery.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-scheduler.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-scheduler.yaml
index 4ec8844f0..2e30ddb16 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-scheduler.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-scheduler.yaml
@@ -27,7 +27,7 @@ spec:
- alert: KubeSchedulerDown
annotations:
description: KubeScheduler has disappeared from Prometheus target discovery.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kube-scheduler"} == 1)
for: 15m
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system.yaml
index 487ab17d5..79ce6036a 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system.yaml
@@ -26,7 +26,7 @@ spec:
- alert: KubeVersionMismatch
annotations:
description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
@@ -38,7 +38,7 @@ spec:
- alert: KubeClientErrors
annotations:
description: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: |-
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/node-exporter.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/node-exporter.yaml
index e4cab93fb..b4bc0b198 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/node-exporter.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/node-exporter.yaml
@@ -26,7 +26,7 @@ spec:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |-
(
@@ -45,7 +45,7 @@ spec:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |-
(
@@ -64,7 +64,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |-
(
@@ -81,7 +81,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |-
(
@@ -98,7 +98,7 @@ spec:
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |-
(
@@ -117,7 +117,7 @@ spec:
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |-
(
@@ -136,7 +136,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |-
(
@@ -153,7 +153,7 @@ spec:
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |-
(
@@ -170,7 +170,7 @@ spec:
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
@@ -182,7 +182,7 @@ spec:
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
@@ -194,7 +194,7 @@ spec:
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
@@ -205,7 +205,7 @@ spec:
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodetextfilecollectorscrapeerror
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: node_textfile_scrape_error{job="node-exporter"} == 1
labels:
@@ -216,7 +216,7 @@ spec:
- alert: NodeClockSkewDetected
annotations:
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclockskewdetected
summary: Clock skew detected.
expr: |-
(
@@ -239,7 +239,7 @@ spec:
- alert: NodeClockNotSynchronising
annotations:
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
min_over_time(node_timex_sync_status[5m]) == 0
@@ -254,7 +254,7 @@ spec:
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddegraded
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded
summary: RAID Array is degraded
expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m
@@ -266,12 +266,42 @@ spec:
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap.
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddiskfailure
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddiskfailure
summary: Failed device in RAID array
expr: node_md_disks{state="failed"} > 0
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
+{{- end }}
+ - alert: NodeFileDescriptorLimit
+ annotations:
+ description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit
+ summary: Kernel is predicted to exhaust file descriptors limit soon.
+ expr: |-
+ (
+ node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
+ )
+ for: 15m
+ labels:
+ severity: warning
+{{- if .Values.defaultRules.additionalRuleLabels }}
+{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
+{{- end }}
+ - alert: NodeFileDescriptorLimit
+ annotations:
+ description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit
+ summary: Kernel is predicted to exhaust file descriptors limit soon.
+ expr: |-
+ (
+ node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
+ )
+ for: 15m
+ labels:
+ severity: critical
+{{- if .Values.defaultRules.additionalRuleLabels }}
+{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
\ No newline at end of file
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/node-network.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/node-network.yaml
index a6c907fe8..540737dfa 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/node-network.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/node-network.yaml
@@ -25,8 +25,9 @@ spec:
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
- message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
- runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkinterfaceflapping
+ description: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping
+ summary: Network interface is often changing its status
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/service-health.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/vm-health.yaml
similarity index 88%
rename from charts/victoria-metrics-k8s-stack/templates/rules/service-health.yaml
rename to charts/victoria-metrics-k8s-stack/templates/rules/vm-health.yaml
index a789491a1..d0f5823c6 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/service-health.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/vm-health.yaml
@@ -1,5 +1,5 @@
{{- /*
-Generated from 'serviceHealth' group from https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/master/deployment/docker/alerts.yml
+Generated from 'vm-health' group from https://raw.githubusercontent.com/VictoriaMetrics/VictoriaMetrics/master/deployment/docker/alerts.yml
Do not change in-place! In order to change this file first read following link:
https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack/hack
*/ -}}
@@ -8,7 +8,7 @@ apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
namespace: {{ .Release.Namespace }}
- name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) "service-health" | trunc 63 | trimSuffix "-" }}
+ name: {{ printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) "vm-health" | trunc 63 | trimSuffix "-" }}
labels:
app: {{ include "victoria-metrics-k8s-stack.name" $ }}
{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }}
@@ -21,7 +21,7 @@ metadata:
{{- end }}
spec:
groups:
- - name: serviceHealth
+ - name: vm-health
rules:
- alert: TooManyRestarts
annotations:
diff --git a/charts/victoria-metrics-k8s-stack/templates/rules/vmsingle.yaml b/charts/victoria-metrics-k8s-stack/templates/rules/vmsingle.yaml
index dcdfe8a47..7750f1cef 100644
--- a/charts/victoria-metrics-k8s-stack/templates/rules/vmsingle.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/rules/vmsingle.yaml
@@ -31,16 +31,16 @@ spec:
description: "Taking into account current ingestion rate, free disk space will be enough only for {{`{{`}} $value | humanizeDuration {{`}}`}} on instance {{`{{`}} $labels.instance {{`}}`}}.\n Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
summary: Instance {{`{{`}} $labels.instance {{`}}`}} will run out of disk space soon
expr: |-
- vm_free_disk_space_bytes / ignoring(path) (
- (
- sum(rate(vm_rows_added_to_storage_total[1d])) -
- sum(rate(vm_deduplicated_samples_total[1d])) without(type)
- )
- *
- (
- sum(vm_data_size_bytes{type!="indexdb"}) /
- sum(vm_rows{type!="indexdb"})
- )
+ vm_free_disk_space_bytes / ignoring(path)
+ (
+ (
+ rate(vm_rows_added_to_storage_total[1d]) -
+ ignoring(type) rate(vm_deduplicated_samples_total{type="merge"}[1d])
+ )
+ * scalar(
+ sum(vm_data_size_bytes{type!="indexdb"}) /
+ sum(vm_rows{type!="indexdb"})
+ )
) < 3 * 24 * 3600
for: 30m
labels:
@@ -173,5 +173,17 @@ spec:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
+{{- end }}
+ - alert: LabelsLimitExceededOnIngestion
+ annotations:
+ dashboard: {{ index .Values.grafana.ingress.hosts 0 }}/d/oS7Bi_0Wz?viewPanel=74&var-instance={{`{{`}} $labels.instance {{`}}`}}
+ description: "VictoriaMetrics limits the number of labels per metric with the `-maxLabelsPerTimeseries` command-line flag.\n This prevents ingestion of metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured correctly or that clients which send these metrics aren't misbehaving."
+ summary: Metrics ingested in ({{`{{`}} $labels.instance {{`}}`}}) are exceeding labels limit
+ expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
+ for: 15m
+ labels:
+ severity: warning
+{{- if .Values.defaultRules.additionalRuleLabels }}
+{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
\ No newline at end of file
diff --git a/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/uninstall_hook.yaml b/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/uninstall_hook.yaml
index 63b5228c9..03ef5d537 100644
--- a/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/uninstall_hook.yaml
+++ b/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/uninstall_hook.yaml
@@ -1,4 +1,5 @@
-{{- if and .Values.operator.cleanupCRD }}
+{{- if .Values.operator.enabled }}
+{{- if .Values.operator.cleanupCRD }}
apiVersion: batch/v1
kind: Job
metadata:
@@ -28,6 +29,7 @@ spec:
kubectl delete vmagents --all --ignore-not-found=true;
kubectl delete vmsingles --all --ignore-not-found=true;
kubectl delete vmalertmanagers --all --ignore-not-found=true;
+ kubectl delete vmclusters --all --ignore-not-found=true;
restartPolicy: OnFailure
---
{{- if .Values.operator.cleanupSA.create }}
@@ -65,8 +67,10 @@ rules:
- vmalerts
- vmsingles
- vmalertmanagers
+ - vmclusters
verbs: ["get", "list", "watch","delete"]
---
{{- end }}
---
+{{- end }}
{{- end }}
\ No newline at end of file
diff --git a/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/vmcluster.yaml b/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/vmcluster.yaml
new file mode 100644
index 000000000..19fb46018
--- /dev/null
+++ b/charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator/vmcluster.yaml
@@ -0,0 +1,263 @@
+{{- if .Values.vmcluster.enabled }}
+---
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMCluster
+metadata:
+ name: {{ .Values.vmcluster.name | default (include "victoria-metrics-k8s-stack.fullname" .) }}
+ namespace: {{ .Release.Namespace }}
+ labels: {{ include "victoria-metrics-k8s-stack.labels" . | nindent 4 }}
+spec:
+{{ .Values.vmcluster.spec | toYaml | indent 2 }}
+
+{{- $newAPI := .Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" -}}
+{{ if .Values.vmcluster.ingress.storage.enabled }}
+---
+{{- with .Values.vmcluster.ingress.storage }}
+{{- $servicePort := $.Values.vmcluster.spec.vmstorage.port | default 8482 -}}
+{{- /* Backend Service is named after the VMCluster CR, so honour the optional vmcluster.name override exactly as metadata.name of the CR does. */ -}}{{- $serviceName := printf "%s-%s" "vmstorage" ($.Values.vmcluster.name | default (include "victoria-metrics-k8s-stack.fullname" $)) | trunc 63 | trimSuffix "-" }}
+{{- $ingressPath := .path -}}
+{{- $ingressPathType := .pathType | default "" -}}
+{{- $extraPaths := .extraPaths -}}
+{{- if $newAPI -}}
+apiVersion: networking.k8s.io/v1
+{{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }}
+apiVersion: networking.k8s.io/v1beta1
+{{- else }}
+apiVersion: extensions/v1beta1
+{{- end }}
+kind: Ingress
+metadata:
+ name: {{ $serviceName }}
+ namespace: {{ $.Release.Namespace }}
+ labels:
+ app.kubernetes.io/component: {{ include "victoria-metrics-k8s-stack.name" $ }}-vmcluster
+{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }}
+{{- if .labels }}
+{{ toYaml .labels | indent 4 }}
+{{- end }}
+ {{- if .annotations }}
+ annotations:
+ {{- range $key, $value := .annotations }}
+ {{ $key }}: {{ tpl $value $ | quote }}
+ {{- end }}
+ {{- end }}
+spec:
+ {{- if .ingressClassName }}
+ ingressClassName: {{ .ingressClassName }}
+ {{- end -}}
+{{- if .tls }}
+ tls:
+{{ tpl (toYaml .tls) $ | indent 4 }}
+{{- end }}
+ rules:
+ {{- if .hosts }}
+ {{- range .hosts }}
+ - host: {{ tpl . $}}
+ http:
+ paths:
+{{- if $extraPaths }}
+{{ toYaml $extraPaths | indent 10 }}
+{{- end }}
+ - path: {{ $ingressPath }}
+ {{- if $newAPI }}
+ pathType: {{ $ingressPathType }}
+ {{- end }}
+ backend:
+ {{- if $newAPI }}
+ service:
+ name: {{ $serviceName }}
+ port:
+ number: {{ $servicePort }}
+ {{- else }}
+ serviceName: {{ $serviceName }}
+ servicePort: {{ $servicePort }}
+ {{- end }}
+ {{- end }}
+ {{- else }}
+ - http:
+ paths:
+ - backend:
+ {{- if $newAPI }}
+ service:
+ name: {{ $serviceName }}
+ port:
+ number: {{ $servicePort }}
+ pathType: {{ $ingressPathType }}
+ {{- else }}
+ serviceName: {{ $serviceName }}
+ servicePort: {{ $servicePort }}
+ {{- end }}
+ {{- if $ingressPath }}
+ path: {{ $ingressPath }}
+ {{- end }}
+ {{- end -}}
+{{- end }}
+{{- end }}
+{{ if .Values.vmcluster.ingress.select.enabled -}}
+---
+{{- with .Values.vmcluster.ingress.select }}
+{{- $servicePort := $.Values.vmcluster.spec.vmselect.port | default 8481 -}}
+{{- /* Backend Service is named after the VMCluster CR, so honour the optional vmcluster.name override exactly as metadata.name of the CR does. */ -}}{{- $serviceName := printf "%s-%s" "vmselect" ($.Values.vmcluster.name | default (include "victoria-metrics-k8s-stack.fullname" $)) | trunc 63 | trimSuffix "-" }}
+{{- $ingressPath := .path -}}
+{{- $ingressPathType := .pathType | default "" -}}
+{{- $extraPaths := .extraPaths -}}
+{{- if $newAPI -}}
+apiVersion: networking.k8s.io/v1
+{{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }}
+apiVersion: networking.k8s.io/v1beta1
+{{- else }}
+apiVersion: extensions/v1beta1
+{{- end }}
+kind: Ingress
+metadata:
+ name: {{ $serviceName }}
+ namespace: {{ $.Release.Namespace }}
+ labels:
+ app.kubernetes.io/component: {{ include "victoria-metrics-k8s-stack.name" $ }}-vmcluster
+{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }}
+{{- if .labels }}
+{{ toYaml .labels | indent 4 }}
+{{- end }}
+ {{- if .annotations }}
+ annotations:
+ {{- range $key, $value := .annotations }}
+ {{ $key }}: {{ tpl $value $ | quote }}
+ {{- end }}
+ {{- end }}
+spec:
+ {{- if .ingressClassName }}
+ ingressClassName: {{ .ingressClassName }}
+ {{- end -}}
+{{- if .tls }}
+ tls:
+{{ tpl (toYaml .tls) $ | indent 4 }}
+{{- end }}
+ rules:
+ {{- if .hosts }}
+ {{- range .hosts }}
+ - host: {{ tpl . $}}
+ http:
+ paths:
+{{- if $extraPaths }}
+{{ toYaml $extraPaths | indent 10 }}
+{{- end }}
+ - path: {{ $ingressPath }}
+ {{- if $newAPI }}
+ pathType: {{ $ingressPathType }}
+ {{- end }}
+ backend:
+ {{- if $newAPI }}
+ service:
+ name: {{ $serviceName }}
+ port:
+ number: {{ $servicePort }}
+ {{- else }}
+ serviceName: {{ $serviceName }}
+ servicePort: {{ $servicePort }}
+ {{- end }}
+ {{- end }}
+ {{- else }}
+ - http:
+ paths:
+ - backend:
+ {{- if $newAPI }}
+ service:
+ name: {{ $serviceName }}
+ port:
+ number: {{ $servicePort }}
+ pathType: {{ $ingressPathType }}
+ {{- else }}
+ serviceName: {{ $serviceName }}
+ servicePort: {{ $servicePort }}
+ {{- end }}
+ {{- if $ingressPath }}
+ path: {{ $ingressPath }}
+ {{- end }}
+ {{- end -}}
+{{- end }}
+{{- end }}
+{{ if .Values.vmcluster.ingress.insert.enabled -}}
+---
+{{- with .Values.vmcluster.ingress.insert }}
+{{- $servicePort := $.Values.vmcluster.spec.vminsert.port | default 8480 -}}
+{{- /* Backend Service is named after the VMCluster CR, so honour the optional vmcluster.name override exactly as metadata.name of the CR does. */ -}}{{- $serviceName := printf "%s-%s" "vminsert" ($.Values.vmcluster.name | default (include "victoria-metrics-k8s-stack.fullname" $)) | trunc 63 | trimSuffix "-" }}
+{{- $ingressPath := .path -}}
+{{- $ingressPathType := .pathType | default "" -}}
+{{- $extraPaths := .extraPaths -}}
+{{- if $newAPI -}}
+apiVersion: networking.k8s.io/v1
+{{- else if $.Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }}
+apiVersion: networking.k8s.io/v1beta1
+{{- else }}
+apiVersion: extensions/v1beta1
+{{- end }}
+kind: Ingress
+metadata:
+ name: {{ $serviceName }}
+ namespace: {{ $.Release.Namespace }}
+ labels:
+ app.kubernetes.io/component: {{ include "victoria-metrics-k8s-stack.name" $ }}-vmcluster
+{{ include "victoria-metrics-k8s-stack.labels" $ | indent 4 }}
+{{- if .labels }}
+{{ toYaml .labels | indent 4 }}
+{{- end }}
+ {{- if .annotations }}
+ annotations:
+ {{- range $key, $value := .annotations }}
+ {{ $key }}: {{ tpl $value $ | quote }}
+ {{- end }}
+ {{- end }}
+spec:
+ {{- if .ingressClassName }}
+ ingressClassName: {{ .ingressClassName }}
+ {{- end -}}
+{{- if .tls }}
+ tls:
+{{ tpl (toYaml .tls) $ | indent 4 }}
+{{- end }}
+ rules:
+ {{- if .hosts }}
+ {{- range .hosts }}
+ - host: {{ tpl . $}}
+ http:
+ paths:
+{{- if $extraPaths }}
+{{ toYaml $extraPaths | indent 10 }}
+{{- end }}
+ - path: {{ $ingressPath }}
+ {{- if $newAPI }}
+ pathType: {{ $ingressPathType }}
+ {{- end }}
+ backend:
+ {{- if $newAPI }}
+ service:
+ name: {{ $serviceName }}
+ port:
+ number: {{ $servicePort }}
+ {{- else }}
+ serviceName: {{ $serviceName }}
+ servicePort: {{ $servicePort }}
+ {{- end }}
+ {{- end }}
+ {{- else }}
+ - http:
+ paths:
+ - backend:
+ {{- if $newAPI }}
+ service:
+ name: {{ $serviceName }}
+ port:
+ number: {{ $servicePort }}
+ pathType: {{ $ingressPathType }}
+ {{- else }}
+ serviceName: {{ $serviceName }}
+ servicePort: {{ $servicePort }}
+ {{- end }}
+ {{- if $ingressPath }}
+ path: {{ $ingressPath }}
+ {{- end }}
+ {{- end -}}
+{{- end }}
+{{- end }}
+{{- end }}
+
diff --git a/charts/victoria-metrics-k8s-stack/values.yaml b/charts/victoria-metrics-k8s-stack/values.yaml
index dda014290..c26f83cc5 100644
--- a/charts/victoria-metrics-k8s-stack/values.yaml
+++ b/charts/victoria-metrics-k8s-stack/values.yaml
@@ -2,6 +2,7 @@ nameOverride: ""
fullnameOverride: ""
operator:
+ enabled: true
cleanupCRD: true
cleanupSA:
create: true
@@ -11,7 +12,6 @@ operator:
tag: v1.16.0
pullPolicy: IfNotPresent
-
serviceAccount:
# Specifies whether a service account should be created
create: true
@@ -45,7 +45,7 @@ defaultRules:
node: true
## Runbook url prefix for default rules
- runbookUrl: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#
+ runbookUrl: https://runbooks.prometheus-operator.dev/runbooks
## Reduce app namespace alert scope
appNamespacesTarget: ".*"
@@ -57,8 +57,6 @@ defaultRules:
## Additional labels for PrometheusRule alerts
additionalRuleLabels: {}
-
-
##############
# victoria-metrics-operator dependency chart configuration. For possible values refer to https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-operator#parameters
@@ -69,12 +67,13 @@ victoria-metrics-operator:
# -- By default, operator converts prometheus-operator objects.
disable_prometheus_converter: true
-
vmsingle:
enabled: true
# spec for VMSingle crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmsinglespec
spec:
+ image:
+ tag: v1.63.0
retentionPeriod: "14"
replicaCount: 1
storage:
@@ -89,7 +88,8 @@ vmsingle:
# See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
# ingressClassName: nginx
# Values can be templated
- annotations: {}
+ annotations:
+ {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
labels: {}
@@ -118,11 +118,172 @@ vmsingle:
# hosts:
# - vmsingle.domain.com
+vmcluster:
+ enabled: false
+  # spec for VMCluster crd
+ # https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmclusterspec
+ spec:
+ retentionPeriod: "14"
+ replicationFactor: 2
+ vmstorage:
+ image:
+ tag: v1.63.0-cluster
+ replicaCount: 2
+ storageDataPath: "/vm-data"
+ storage:
+ volumeClaimTemplate:
+ spec:
+ resources:
+ requests:
+ storage: 10Gi
+ resources:
+ limits:
+ cpu: "1"
+ memory: 1500Mi
+ vmselect:
+ image:
+ tag: v1.63.0-cluster
+ replicaCount: 2
+ cacheMountPath: "/select-cache"
+ storage:
+ volumeClaimTemplate:
+ spec:
+ resources:
+ requests:
+ storage: 2Gi
+ resources:
+ limits:
+ cpu: "1"
+ memory: "1000Mi"
+ requests:
+ cpu: "0.5"
+ memory: "500Mi"
+ vminsert:
+ image:
+ tag: v1.63.0-cluster
+ replicaCount: 2
+ resources:
+ limits:
+ cpu: "1"
+ memory: 1000Mi
+ requests:
+ cpu: "0.5"
+ memory: "500Mi"
+
+ ingress:
+ storage:
+ enabled: false
+ # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
+ # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
+ # ingressClassName: nginx
+ # Values can be templated
+ annotations:
+ {}
+ # kubernetes.io/ingress.class: nginx
+ # kubernetes.io/tls-acme: "true"
+ labels: {}
+ path: /
+      # pathType is only for k8s >= 1.19
+ pathType: Prefix
+
+ hosts:
+ - vmstorage.domain.com
+ ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
+ extraPaths: []
+ # - path: /*
+ # backend:
+ # serviceName: ssl-redirect
+ # servicePort: use-annotation
+      ## Or for k8s >= 1.19
+ # - path: /*
+ # pathType: Prefix
+ # backend:
+ # service:
+ # name: ssl-redirect
+ # port:
+ # name: service
+ tls: []
+ # - secretName: vmstorage-ingress-tls
+ # hosts:
+ # - vmstorage.domain.com
+ select:
+ enabled: false
+ # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
+ # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
+ # ingressClassName: nginx
+ # Values can be templated
+ annotations:
+ {}
+ # kubernetes.io/ingress.class: nginx
+ # kubernetes.io/tls-acme: "true"
+ labels: {}
+ path: /
+      # pathType is only for k8s >= 1.19
+ pathType: Prefix
+
+ hosts:
+ - vmselect.domain.com
+ ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
+ extraPaths: []
+ # - path: /*
+ # backend:
+ # serviceName: ssl-redirect
+ # servicePort: use-annotation
+      ## Or for k8s >= 1.19
+ # - path: /*
+ # pathType: Prefix
+ # backend:
+ # service:
+ # name: ssl-redirect
+ # port:
+ # name: service
+ tls: []
+ # - secretName: vmselect-ingress-tls
+ # hosts:
+ # - vmselect.domain.com
+ insert:
+ enabled: false
+ # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
+ # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
+ # ingressClassName: nginx
+ # Values can be templated
+ annotations:
+ {}
+ # kubernetes.io/ingress.class: nginx
+ # kubernetes.io/tls-acme: "true"
+ labels: {}
+ path: /
+      # pathType is only for k8s >= 1.19
+ pathType: Prefix
+
+ hosts:
+ - vminsert.domain.com
+ ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
+ extraPaths: []
+ # - path: /*
+ # backend:
+ # serviceName: ssl-redirect
+ # servicePort: use-annotation
+      ## Or for k8s >= 1.19
+ # - path: /*
+ # pathType: Prefix
+ # backend:
+ # service:
+ # name: ssl-redirect
+ # port:
+ # name: service
+ tls: []
+ # - secretName: vminsert-ingress-tls
+ # hosts:
+ # - vminsert.domain.com
+
alertmanager:
enabled: true
# spec for VMAlertmanager crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmalertmanagerspec
spec:
+ image:
+ tag: v0.22.2
externalURL: ""
routePrefix: /
@@ -134,81 +295,81 @@ alertmanager:
resolve_timeout: 5m
slack_api_url: "http://slack:30500/"
templates:
- - "/etc/vm/configs/**/*.tmpl"
+ - "/etc/vm/configs/**/*.tmpl"
route:
- group_by: ['job']
+ group_by: ["job"]
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
- receiver: 'slack-monitoring'
+ receiver: "slack-monitoring"
routes:
- ###################################################
- ## Duplicate code_owner routes to teams
- ## These will send alerts to team channels but continue
- ## processing through the rest of the tree to handled by on-call
- - match_re:
- code_owner: '.+'
- routes:
- - match: {severity: info|warning|critical}
- continue: true
- receiver: slack-code-owners
-
- ###################################################
- ## Standard on-call routes
- - match_re:
- severity: info|warning|critical
- receiver: slack-monitoring
- continue: true
-
+ ###################################################
+ ## Duplicate code_owner routes to teams
+ ## These will send alerts to team channels but continue
+        ## processing through the rest of the tree to be handled by on-call
+ - match_re:
+ code_owner: ".+"
+ routes:
+ - match: { severity: info|warning|critical }
+ continue: true
+ receiver: slack-code-owners
+
+ ###################################################
+ ## Standard on-call routes
+ - match_re:
+ severity: info|warning|critical
+ receiver: slack-monitoring
+ continue: true
+
receivers:
- name: "slack-monitoring"
slack_configs:
- - channel: "#channel"
- send_resolved: true
- title: '{{ template "slack.monzo.title" . }}'
- icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}'
- color: '{{ template "slack.monzo.color" . }}'
- text: '{{ template "slack.monzo.text" . }}'
- actions:
- - type: button
- text: 'Runbook :green_book:'
- url: '{{ (index .Alerts 0).Annotations.runbook }}'
- - type: button
- text: 'Query :mag:'
- url: '{{ (index .Alerts 0).GeneratorURL }}'
- - type: button
- text: 'Dashboard :grafana:'
- url: '{{ (index .Alerts 0).Annotations.dashboard }}'
- - type: button
- text: 'Silence :no_bell:'
- url: '{{ template "__alert_silence_link" . }}'
- - type: button
- text: '{{ template "slack.monzo.link_button_text" . }}'
- url: '{{ .CommonAnnotations.link_url }}'
+ - channel: "#channel"
+ send_resolved: true
+ title: '{{ template "slack.monzo.title" . }}'
+ icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}'
+ color: '{{ template "slack.monzo.color" . }}'
+ text: '{{ template "slack.monzo.text" . }}'
+ actions:
+ - type: button
+ text: "Runbook :green_book:"
+ url: "{{ (index .Alerts 0).Annotations.runbook }}"
+ - type: button
+ text: "Query :mag:"
+ url: "{{ (index .Alerts 0).GeneratorURL }}"
+ - type: button
+ text: "Dashboard :grafana:"
+ url: "{{ (index .Alerts 0).Annotations.dashboard }}"
+ - type: button
+ text: "Silence :no_bell:"
+ url: '{{ template "__alert_silence_link" . }}'
+ - type: button
+ text: '{{ template "slack.monzo.link_button_text" . }}'
+ url: "{{ .CommonAnnotations.link_url }}"
- name: slack-code-owners
slack_configs:
- - channel: '#{{- template "slack.monzo.code_owner_channel" . -}}'
- send_resolved: true
- title: '{{ template "slack.monzo.title" . }}'
- icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}'
- color: '{{ template "slack.monzo.color" . }}'
- text: '{{ template "slack.monzo.text" . }}'
- actions:
- - type: button
- text: 'Runbook :green_book:'
- url: '{{ (index .Alerts 0).Annotations.runbook }}'
- - type: button
- text: 'Query :mag:'
- url: '{{ (index .Alerts 0).GeneratorURL }}'
- - type: button
- text: 'Dashboard :grafana:'
- url: '{{ (index .Alerts 0).Annotations.dashboard }}'
- - type: button
- text: 'Silence :no_bell:'
- url: '{{ template "__alert_silence_link" . }}'
- - type: button
- text: '{{ template "slack.monzo.link_button_text" . }}'
- url: '{{ .CommonAnnotations.link_url }}'
+ - channel: '#{{- template "slack.monzo.code_owner_channel" . -}}'
+ send_resolved: true
+ title: '{{ template "slack.monzo.title" . }}'
+ icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}'
+ color: '{{ template "slack.monzo.color" . }}'
+ text: '{{ template "slack.monzo.text" . }}'
+ actions:
+ - type: button
+ text: "Runbook :green_book:"
+ url: "{{ (index .Alerts 0).Annotations.runbook }}"
+ - type: button
+ text: "Query :mag:"
+ url: "{{ (index .Alerts 0).GeneratorURL }}"
+ - type: button
+ text: "Dashboard :grafana:"
+ url: "{{ (index .Alerts 0).Annotations.dashboard }}"
+ - type: button
+ text: "Silence :no_bell:"
+ url: '{{ template "__alert_silence_link" . }}'
+ - type: button
+ text: '{{ template "slack.monzo.link_button_text" . }}'
+ url: "{{ .CommonAnnotations.link_url }}"
# better alert templates for slack
# source https://gist.github.com/milesbxf/e2744fc90e9c41b47aa47925f8ff6512
@@ -221,7 +382,8 @@ alertmanager:
# See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
# ingressClassName: nginx
# Values can be templated
- annotations: {}
+ annotations:
+ {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
labels: {}
@@ -250,13 +412,13 @@ alertmanager:
# hosts:
# - alertmanager.domain.com
-
-
vmalert:
enabled: true
# spec for VMAlert crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmalertspec
spec:
+ image:
+ tag: v1.63.0
evaluationInterval: 15s
ingress:
enabled: false
@@ -264,7 +426,8 @@ vmalert:
# See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
# ingressClassName: nginx
# Values can be templated
- annotations: {}
+ annotations:
+ {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
labels: {}
@@ -293,12 +456,13 @@ vmalert:
# hosts:
# - vmalert.domain.com
-
vmagent:
enabled: true
# spec for VMAgent crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmagentspec
spec:
+ image:
+ tag: v1.63.0
scrapeInterval: 25s
externalLabels:
cluster: cluster-name
@@ -310,7 +474,8 @@ vmagent:
# See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
# ingressClassName: nginx
# Values can be templated
- annotations: {}
+ annotations:
+ {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
labels: {}
@@ -339,7 +504,6 @@ vmagent:
# hosts:
# - vmagent.domain.com
-
#################################################
### dependencies #####
#################################################
@@ -374,14 +538,14 @@ grafana:
dashboardproviders.yaml:
apiVersion: 1
providers:
- - name: 'default'
- orgId: 1
- folder: ''
- type: file
- disableDeletion: false
- editable: true
- options:
- path: /var/lib/grafana/dashboards/default
+ - name: "default"
+ orgId: 1
+ folder: ""
+ type: file
+ disableDeletion: false
+ editable: true
+ options:
+ path: /var/lib/grafana/dashboards/default
dashboards:
default:
@@ -402,7 +566,8 @@ grafana:
# See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
# ingressClassName: nginx
# Values can be templated
- annotations: {}
+ annotations:
+ {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
labels: {}
@@ -435,12 +600,10 @@ grafana:
# wheter we should create a service scrape resource for node-exporter
enabled: true
- # spec for VMServiceScrape crd
+ # spec for VMServiceScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec
spec: {}
-
-
# prometheus-node-exporter dependency chart configuration. For possible values refer to https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml
prometheus-node-exporter:
enabled: true
@@ -458,27 +621,23 @@ prometheus-node-exporter:
# wheter we should create a service scrape resource for node-exporter
enabled: true
- # spec for VMServiceScrape crd
+ # spec for VMServiceScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec
spec:
jobLabel: jobLabel
-
-
# kube-state-metrics dependency chart configuration. For possible values refer to https://github.com/kubernetes/kube-state-metrics/blob/master/charts/kube-state-metrics/values.yaml
kube-state-metrics:
enabled: true
## all values for kube-state-metrics helm chart can be specified here
- # spec for VMServiceScrape crd
+ # spec for VMServiceScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec
vmServiceScrape:
spec: {}
#TODO: selector override for kube-state-metrics deployed separatelly
-
-
### Service Monitors
## Component scraping the kubelets
kubelet:
@@ -488,7 +647,7 @@ kubelet:
cadvisor: true
## Enable scraping /metrics/probes from kubelet's service
probes: true
- # spec for VMNodeScrape crd
+ # spec for VMNodeScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmnodescrapespec
spec:
scheme: "https"
@@ -507,34 +666,30 @@ kubelet:
- targetLabel: "job"
replacement: "kubelet"
-
-
## Component scraping the kube api server
kubeApiServer:
enabled: true
- # spec for VMServiceScrape crd
+ # spec for VMServiceScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec
spec:
endpoints:
- - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
- # bearerTokenSecret:
- # key: ""
- port: https
- scheme: https
- tlsConfig:
- caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- serverName: kubernetes
+ - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ # bearerTokenSecret:
+ # key: ""
+ port: https
+ scheme: https
+ tlsConfig:
+ caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ serverName: kubernetes
jobLabel: component
namespaceSelector:
matchNames:
- - default
+ - default
selector:
matchLabels:
component: apiserver
provider: kubernetes
-
-
## Component scraping the kube controller manager
kubeControllerManager:
enabled: true
@@ -555,10 +710,9 @@ kubeControllerManager:
# selector:
# component: kube-controller-manager
-
vmServiceScrape:
enabled: true
- # spec for VMServiceScrape crd
+ # spec for VMServiceScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec
spec:
jobLabel: jobLabel
@@ -572,8 +726,6 @@ kubeControllerManager:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
-
-
## Component scraping coreDns. Use either this or kubeDns
##
coreDns:
@@ -587,14 +739,12 @@ coreDns:
vmServiceScrape:
enabled: true
- # spec for VMServiceScrape crd
+ # spec for VMServiceScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec
- spec:
+ spec:
endpoints:
- - port: http-metrics
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-
+ - port: http-metrics
+ bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
## Component scraping etcd
##
@@ -619,7 +769,7 @@ kubeEtcd:
vmServiceScrape:
enabled: true
- # spec for VMServiceScrape crd
+ # spec for VMServiceScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec
spec:
jobLabel: jobLabel
@@ -632,8 +782,6 @@ kubeEtcd:
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-
-
## Component scraping kube scheduler
##
kubeScheduler:
@@ -657,7 +805,7 @@ kubeScheduler:
vmServiceScrape:
enabled: true
- # spec for VMServiceScrape crd
+ # spec for VMServiceScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec
spec:
jobLabel: jobLabel
@@ -670,8 +818,6 @@ kubeScheduler:
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-
-
## Component scraping kube proxy
##
kubeProxy:
@@ -693,7 +839,7 @@ kubeProxy:
vmServiceScrape:
enabled: true
- # spec for VMServiceScrape crd
+ # spec for VMServiceScrape crd
# https://github.com/VictoriaMetrics/operator/blob/master/docs/api.MD#vmservicescrapespec
spec:
jobLabel: jobLabel