Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Grafana: extend network-related dashboard #750

Merged
merged 1 commit into from Aug 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -48,7 +48,7 @@ class Row(g.Row):
height = attr.ib(default=PANEL_HEIGHT)


def simple_graph(title, exprs, yAxes=None, legend="", interval="5s"):
def simple_graph(title, exprs, legend="", interval="5s", **kwargs):
if not isinstance(exprs, (list, tuple)):
exprs = [exprs]
if legend != "" and len(exprs) != 1:
Expand All @@ -62,5 +62,5 @@ def simple_graph(title, exprs, yAxes=None, legend="", interval="5s"):
)
for expr in exprs
],
yAxes=yAxes or g.YAxes(),
**kwargs
)
Expand Up @@ -26,20 +26,20 @@
d.simple_graph(
"API call latency (1s thresholds)",
'apiserver:apiserver_request_latency:histogram_quantile{quantile="0.99", verb!="LIST", verb!="WATCH", verb!="CONNECT"}',
g.single_y_axis(format=g.SECONDS_FORMAT),
"{{verb}} {{scope}}/{{resource}}",
legend="{{verb}} {{scope}}/{{resource}}",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
"API call latency aggregated (1s thresholds)",
'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"LIST|WATCH|CONNECT|DELETECOLLECTION"}[1d])) by (le, resource, verb, scope, subresource))',
g.single_y_axis(format=g.SECONDS_FORMAT),
"{{verb}} {{scope}}/{{resource}}",
legend="{{verb}} {{scope}}/{{resource}}",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
"API call latency aggregated (prometheus, 1s threshold)",
'quantile_over_time(0.99, apiserver:apiserver_request_latency:histogram_quantile{verb!~"LIST|WATCH|CONNECT|DELETECOLLECTION"}[1d])',
g.single_y_axis(format=g.SECONDS_FORMAT),
"{{verb}} {{scope}}/{{resource}}",
legend="{{verb}} {{scope}}/{{resource}}",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
]

Expand Down Expand Up @@ -68,38 +68,38 @@
d.simple_graph(
"etcd bytes sent",
"irate(etcd_network_client_grpc_sent_bytes_total[1m])",
g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
),
d.simple_graph(
"etcd lists rate",
'sum(rate(etcd_request_duration_seconds_count{operation="list"}[1m])) by (type)',
g.single_y_axis(format=g.OPS_FORMAT),
yAxes=g.single_y_axis(format=g.OPS_FORMAT),
),
d.simple_graph(
"etcd operations rate",
"sum(rate(etcd_request_duration_seconds_count[1m])) by (operation, type)",
g.single_y_axis(format=g.OPS_FORMAT),
yAxes=g.single_y_axis(format=g.OPS_FORMAT),
),
d.simple_graph(
"etcd get lease latency by instance (99th percentile)",
'histogram_quantile(0.99, sum(rate(etcd_request_duration_seconds_bucket{operation="get", type="*coordination.Lease"}[1m])) by (le, type, instance))',
g.single_y_axis(format=g.SECONDS_FORMAT),
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
"etcd get latency by type (99th percentile)",
'histogram_quantile(0.99, sum(rate(etcd_request_duration_seconds_bucket{operation="get"}[1m])) by (le, type))',
g.single_y_axis(format=g.SECONDS_FORMAT),
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
"etcd get latency by type (50th percentile)",
'histogram_quantile(0.50, sum(rate(etcd_request_duration_seconds_bucket{operation="get"}[1m])) by (le, type))',
g.single_y_axis(format=g.SECONDS_FORMAT),
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph("etcd instance id", "sum(etcd_server_id) by (instance, server_id)"),
d.simple_graph(
"etcd network latency (99th percentile)",
"histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (le, instance, To))",
g.single_y_axis(format=g.SECONDS_FORMAT),
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
"etcd compaction keys",
Expand All @@ -108,7 +108,7 @@
d.simple_graph(
"etcd compaction pause sum duration",
"delta(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_sum[1m])",
g.single_y_axis(format=g.MILLISECONDS_FORMAT),
yAxes=g.single_y_axis(format=g.MILLISECONDS_FORMAT),
),
d.simple_graph(
"etcd compaction pause num chunks",
Expand All @@ -117,7 +117,7 @@
d.simple_graph(
"etcd_disk_backend_commit_duration_seconds",
"histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds[1m])) by (le, instance))",
g.single_y_axis(format=g.SECONDS_FORMAT),
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.Graph(
title="etcd compaction max pause",
Expand All @@ -138,17 +138,19 @@
"etcd_mvcc_db_total_size_in_use_in_bytes",
"etcd_server_quota_backend_bytes",
],
g.single_y_axis(format=g.BYTES_FORMAT),
yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
),
]

APISERVER_PANELS = [
d.simple_graph("goroutines", 'go_goroutines{job="apiserver"}'),
d.simple_graph("gc rate", 'rate(go_gc_duration_seconds_count{job="apiserver"}[1m])'),
d.simple_graph(
"gc rate", 'rate(go_gc_duration_seconds_count{job="apiserver"}[1m])'
),
d.simple_graph(
"alloc rate",
'rate(go_memstats_alloc_bytes_total{job="apiserver"}[1m])',
g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
),
d.simple_graph(
"Number of active watches",
Expand All @@ -161,7 +163,7 @@
d.simple_graph(
"(Experimental) Watch events traffic",
"sum(irate(apiserver_watch_events_sizes_sum[1m])) by (version, kind, instance)",
g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
),
d.simple_graph(
"Watch event avg size",
Expand All @@ -182,14 +184,14 @@
d.simple_graph(
"Request latency (50th percentile)",
'apiserver:apiserver_request_latency:histogram_quantile{quantile="0.50", verb!="WATCH"}',
g.single_y_axis(format=g.SECONDS_FORMAT),
"{{verb}} {{scope}}/{{resource}}",
legend="{{verb}} {{scope}}/{{resource}}",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
"Request latency (99th percentile)",
'apiserver:apiserver_request_latency:histogram_quantile{quantile="0.99", verb!="WATCH"}',
g.single_y_axis(format=g.SECONDS_FORMAT),
"{{verb}} {{scope}}/{{resource}}",
legend="{{verb}} {{scope}}/{{resource}}",
yAxes=g.single_y_axis(format=g.SECONDS_FORMAT),
),
d.simple_graph(
'"Big" LIST requests',
Expand All @@ -198,16 +200,16 @@
d.simple_graph(
"Traffic",
'sum(rate(apiserver_response_sizes_sum{verb!="WATCH"}[1m])) by (verb, version, resource, scope, instance)',
g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
yAxes=g.single_y_axis(format=g.BYTES_PER_SEC_FORMAT),
),
]

VM_PANELS = [
d.simple_graph(
"fs bytes reads by container",
"sum(rate(container_fs_reads_bytes_total[1m])) by (container_name, instance)",
g.single_y_axis(format=g.BYTES_FORMAT),
legend="{{container_name}} {{instance}}",
yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
),
d.simple_graph(
"fs reads by container",
Expand All @@ -217,8 +219,8 @@
d.simple_graph(
"fs bytes writes by container",
"sum(rate(container_fs_writes_bytes_total[1m])) by (container_name, instance)",
g.single_y_axis(format=g.BYTES_FORMAT),
legend="{{container_name}} {{instance}}",
yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
),
d.simple_graph(
"fs writes by container",
Expand All @@ -244,7 +246,7 @@
),
g.Target(expr="machine_memory_bytes", legendFormat="limit"),
],
g.single_y_axis(format=g.BYTES_FORMAT),
yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
),
d.simple_graph(
"memory working set by container",
Expand All @@ -255,7 +257,7 @@
),
g.Target(expr="machine_memory_bytes", legendFormat="limit"),
],
g.single_y_axis(format=g.BYTES_FORMAT),
yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
),
d.Graph(
title="Network usage (bytes)",
Expand Down Expand Up @@ -342,7 +344,6 @@
panels=[
d.Graph(
title="Coredns memory",
dataSource="$source",
targets=[
g.Target(
expr='quantile(1, sum(process_resident_memory_bytes{job="kube-dns"}) by (pod))',
Expand Down

This file was deleted.