Skip to content

Commit

Permalink
imp: refactor alerts files, add new alerts, add new tracing alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
johnalotoski committed May 7, 2024
1 parent 93aa275 commit 35e594a
Show file tree
Hide file tree
Showing 6 changed files with 353 additions and 270 deletions.
71 changes: 71 additions & 0 deletions flake/opentofu/cluster/cardano-node-divergence.nix-import
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
let
  # Environments that receive a divergence alert. The list order determines
  # the order of the generated rules.
  environments = [
    "mainnet"
    "preprod"
    "preview"
    "private"
    "sanchonet"
    "shelley-qa"
  ];

  # Builds one block-divergence alert rule.
  #
  #   env               value matched against the `environment` label
  #   lagBlocks         block-number distance from the maximum to exceed
  #   lagSeconds        slot-number distance from the maximum compared against
  #   thresholdMinutes  how long the condition must hold (`for`)
  #   useLegacyTracing  true: cardano_node_metrics_* names,
  #                     false: ChainDB_* names and a "_new_tracing" alert suffix
  mkDivergenceAlert = {
    env,
    lagBlocks ? 6,
    lagSeconds ? 120,
    thresholdMinutes ? 5,
    useLegacyTracing ? true,
  }: let
    blocks = toString lagBlocks;
    seconds = toString lagSeconds;
    minutes = toString thresholdMinutes;

    nameSuffix =
      if useLegacyTracing
      then ""
      else "_new_tracing";

    metrics =
      if useLegacyTracing
      then {
        block = "cardano_node_metrics_blockNum_int";
        slot = "cardano_node_metrics_slotNum_int";
      }
      else {
        block = "ChainDB_BlockNum";
        slot = "ChainDB_SlotNum";
      };

    selector = ''{environment="${env}"}'';

    # Absolute distance of every series of `metric` from the maximum of that
    # metric within the environment.
    distanceFromMax = metric: "abs(max(${metric}${selector}) - on() group_right() ${metric}${selector})";
  in {
    alert = "cardano_node_block_divergence_${env}${nameSuffix}";
    # The two `bool` comparisons each yield 0 or 1; their difference is 1 only
    # when the first comparison holds and the second does not.
    expr = ''
      (
      (${distanceFromMax metrics.block} > bool ${blocks})
      - (${distanceFromMax metrics.slot} < bool ${seconds})
      ) == 1
    '';
    for = "${minutes}m";
    labels = {severity = "page";};
    annotations = {
      summary = "{{$labels.instance}}: cardano-node block divergence detected on ${env} for more than ${minutes} minutes.";
      description = "{{$labels.instance}}: cardano-node block divergence of more than ${blocks} blocks and ${seconds} seconds lag detected for more than ${minutes} minutes.";
    };
  };

  legacyAlert = env: mkDivergenceAlert {inherit env;};
  newTracingAlert = env:
    mkDivergenceAlert {
      inherit env;
      useLegacyTracing = false;
    };
in {
  namespace = "cardano";
  name = "cardano-node-divergence";
  # All legacy-tracing rules first, then all new-tracing rules.
  rule = map legacyAlert environments ++ map newTracingAlert environments;
}
124 changes: 124 additions & 0 deletions flake/opentofu/grafana/alerts/cardano-node-forge.nix-import
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
let
  # Builds one KES-expiry alert rule.
  #
  #   periodNotice      the alert condition is "remaining periods <= periodNotice"
  #   useLegacyTracing  true: use the legacy remainingKESPeriods metric,
  #                     false: derive the value from the KESInfo_* metrics and
  #                     add a "_new_tracing" suffix to the alert name
  mkKesAlert = {
    periodNotice,
    useLegacyTracing ? true,
  }: let
    periods = toString periodNotice;

    # NOTE(review): the new-tracing form is computed from three KESInfo gauges;
    # confirm it yields the same quantity as the legacy metric.
    kesPeriodsRemaining =
      if useLegacyTracing
      then "cardano_node_metrics_remainingKESPeriods_int"
      else "(KESInfo_operationalCertificateExpiryKESPeriod - KESInfo_operationalCertificateStartKESPeriod) - KESInfo_currentKESPeriod";

    # The text follows the configured notice period (previously hard-coded to
    # 10 for every alert) and the `<=` comparison used in `expr`.
    notice = "{{$labels.instance}}: cardano-node KES expiration notice: ${periods} or fewer periods until KES expiration.";
  in {
    alert = "cardano_node_KES_expiration_metric_${periods}period_notice${
      if useLegacyTracing
      then ""
      else "_new_tracing"
    }";
    expr = "${kesPeriodsRemaining} <= ${periods}";
    for = "5m";
    labels.severity = "page";
    annotations = {
      summary = notice;
      description = notice;
    };
  };
in {
  namespace = "cardano";
  name = "cardano-node-forge";
  rule = [
    # No increase of the forged-block counter over 24h (legacy tracing).
    {
      alert = "cardano_node_forge_blocks_missing";
      expr = "increase(cardano_node_metrics_Forge_forged_int[24h]) == 0";
      for = "1m";
      labels.severity = "page";
      annotations = {
        summary = "{{$labels.instance}}: cardano-node has not forged any blocks for 1 day.";
        description = ''
          {{$labels.instance}}: cardano-node has not forged any blocks for 1 day.
          This should be investigated, or the alert adjusted if this is expected.'';
      };
    }
    # No increase of the forged-block counter over 24h (new tracing).
    {
      alert = "cardano_node_forge_blocks_missing_new_tracing";
      expr = "increase(Forge_BlocksForgedNum[24h]) == 0";
      for = "1m";
      labels.severity = "page";
      annotations = {
        summary = "{{$labels.instance}}: cardano-node has not forged any blocks for 1 day.";
        description = ''
          {{$labels.instance}}: cardano-node has not forged any blocks for 1 day.
          This should be investigated, or the alert adjusted if this is expected.'';
      };
    }
    # The didn't-adopt counter increased within the last hour (legacy tracing).
    {
      alert = "cardano_node_forge_not_adopted_error";
      expr = "increase(cardano_node_metrics_Forge_didnt_adopt_int[1h]) > 0";
      for = "1m";
      labels.severity = "page";
      annotations = {
        summary = "{{$labels.instance}}: cardano-node is failing to adopt recent forged blocks.";
        description = ''
          {{$labels.instance}}: cardano-node failed to adopt 1 or more blocks in the past hour.
          A restart of node on the affected machine(s) may be required.'';
      };
    }
    # New tracing: count the samples in the last hour at which the last forged
    # slot differs from the last adopted slot. `-` binds tighter than `==` in
    # PromQL, so the inner term is 1 when both gauges agree and 0 otherwise;
    # `abs(... - 1)` inverts that into a per-sample mismatch flag.
    # The previous expression subtracted Forge_ForgedSlotLast from itself,
    # which is always 0, so the alert could never fire.
    # NOTE(review): assumes the node exports Forge_AdoptedOwnSlotLast under the
    # new tracing system -- confirm the metric name against a live scrape.
    {
      alert = "cardano_node_forge_not_adopted_error_new_tracing";
      expr = "sum_over_time(abs((Forge_ForgedSlotLast - Forge_AdoptedOwnSlotLast == bool 0) - 1)[1h:]) > 0";
      for = "1m";
      labels.severity = "page";
      annotations = {
        summary = "{{$labels.instance}}: cardano-node is failing to adopt recent forged blocks.";
        description = ''
          {{$labels.instance}}: cardano-node failed to adopt 1 or more blocks in the past hour.
          A restart of node on the affected machine(s) may be required.'';
      };
    }
    # The cannot-forge counter increased within the last hour (new tracing only).
    {
      alert = "cardano_node_cannot_forge_new_tracing";
      expr = "increase(Forge_NodeCannotForgeNum[1h]) > 0";
      for = "1m";
      labels.severity = "page";
      annotations = {
        summary = "{{$labels.instance}}: cardano-node is failing to forge blocks.";
        description = ''
          {{$labels.instance}}: cardano-node failed to forge 1 or more blocks in the past hour.
          A restart of node on the affected machine(s) may be required.'';
      };
    }
    # Per-second rate of missed slots over 5m above 0.5 (legacy tracing).
    {
      alert = "too_many_slot_leadership_checks_missed";
      expr = "rate(cardano_node_metrics_slotsMissedNum_int[5m]) * 1 > 0.5";
      for = "2m";
      labels.severity = "page";
      annotations = {
        summary = "{{$labels.instance}}: block producing node is failing to check for slot leadership for more than half of the slots.";
        description = "{{$labels.instance}}: block producing node is failing to check for slot leadership for more than half of the slots for more than 2 min.";
      };
    }
    # Per-second rate of missed slots over 5m above 0.5 (new tracing).
    {
      alert = "too_many_slot_leadership_checks_missed_new_tracing";
      expr = "rate(Forge_SlotsMissed[5m]) * 1 > 0.5";
      for = "2m";
      labels.severity = "page";
      annotations = {
        summary = "{{$labels.instance}}: block producing node is failing to check for slot leadership for more than half of the slots.";
        description = "{{$labels.instance}}: block producing node is failing to check for slot leadership for more than half of the slots for more than 2 min.";
      };
    }
    # KES expiry notices at 10, 5 and 1 remaining periods: legacy tracing
    # first, then the new-tracing equivalents.
    (mkKesAlert {periodNotice = 10;})
    (mkKesAlert {periodNotice = 5;})
    (mkKesAlert {periodNotice = 1;})
    (mkKesAlert {
      periodNotice = 10;
      useLegacyTracing = false;
    })
    (mkKesAlert {
      periodNotice = 5;
      useLegacyTracing = false;
    })
    (mkKesAlert {
      periodNotice = 1;
      useLegacyTracing = false;
    })
  ];
}
92 changes: 92 additions & 0 deletions flake/opentofu/grafana/alerts/cardano-node-network.nix-import
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
let
  # Alert if blocks are above that % full.
  highBlockUtilization = toString 95;

  pageSeverity = {severity = "page";};

  # Block-utilization rule for one block-size metric. `suffix` is appended to
  # the alert name ("" or "_new_tracing").
  # TODO: Static max block size until node publishes max block size metric
  mkUtilizationAlert = {
    suffix,
    metric,
  }: {
    alert = "blocks_utilization_too_high${suffix}";
    expr = "100 * avg(avg_over_time(${metric}[6h]) / 90112) > ${highBlockUtilization}";
    for = "5m";
    labels = pageSeverity;
    annotations = {
      summary = "Block utilization is above ${highBlockUtilization}%.";
      description = "Block utilization has averaged above ${highBlockUtilization}% for more than 6h.";
    };
  };

  # Block-fetch delay rule: fires when `metric` stays below `fraction`.
  #   level     alert name segment ("high" or "critical")
  #   suffix    appended to the alert name ("" or "_new_tracing")
  #   fraction  threshold exactly as written into the expression
  #   percent   the same threshold as shown in the annotation text
  #   note      optional text appended to the description
  mkBlockDelayAlert = {
    level,
    suffix,
    metric,
    fraction,
    percent,
    note ? "",
  }: {
    alert = "cardano_blockfetchclient_blockdelay_${level}${suffix}";
    expr = "${metric} < ${fraction}";
    for = "10m";
    labels = pageSeverity;
    annotations = {
      summary = "{{$labels.instance}}: cardano-node has less than ${percent}% of blocks arriving in 5 seconds.";
      description = "{{$labels.instance}}: cardano-node has block fetch delays with less than ${percent}% of blocks over a 6 hour window arriving in 5 seconds.${note}";
    };
  };

  legacyBlockDelay = "cardano_node_metrics_blockfetchclient_blockdelay_cdfFive";
  newBlockDelay = "Blockfetch_Client_Blockdelay_cdfFive";
  criticalNote = " This is abnormal even for mempool trace enabled nodes.";
in {
  namespace = "cardano";
  name = "cardano-node-network";
  rule = [
    # TODO: Legacy cardano-ops metric; potentially convert to netdata statsd push/scrape
    {
      alert = "high_cardano_ping_latency";
      expr = "avg_over_time(cardano_ping_latency_ms[5m]) > 250";
      for = "30m";
      labels = pageSeverity;
      annotations = {
        summary = "{{$labels.instance}}: Cardano average ping latency over 5 minutes has been above 250 milliseconds for the last 30 minutes";
        description = "{{$labels.instance}}: Cardano average ping latency over 5 minutes has been above 250 milliseconds for the last 30 minutes.";
      };
    }

    # TODO: New tracer equivalent metric?
    {
      alert = "blocks_adoption_delay_too_high";
      expr = "avg(quantile_over_time(0.95, cardano_node_metrics_blockadoption_forgeDelay_real[6h])) >= 4.5";
      for = "1m";
      labels = pageSeverity;
      annotations = {
        summary = "Blocks adoption delay have been above 4.5s for more than 5% of blocks";
        description = "Node average of blocks adoption delay have been above 4.5s for more than 5% of blocks for more than 6 hours";
      };
    }

    # Block utilization: legacy metric, then the new-tracing metric.
    (mkUtilizationAlert {
      suffix = "";
      metric = "cardano_node_metrics_blockfetchclient_blocksize";
    })
    (mkUtilizationAlert {
      suffix = "_new_tracing";
      metric = "Blockfetch_Client_Blocksize";
    })

    # Block-fetch delay below 0.90.
    (mkBlockDelayAlert {
      level = "high";
      suffix = "";
      metric = legacyBlockDelay;
      fraction = "0.90";
      percent = "90";
    })
    (mkBlockDelayAlert {
      level = "high";
      suffix = "_new_tracing";
      metric = newBlockDelay;
      fraction = "0.90";
      percent = "90";
    })

    # Block-fetch delay below 0.50.
    (mkBlockDelayAlert {
      level = "critical";
      suffix = "";
      metric = legacyBlockDelay;
      fraction = "0.50";
      percent = "50";
      note = criticalNote;
    })
    (mkBlockDelayAlert {
      level = "critical";
      suffix = "_new_tracing";
      metric = newBlockDelay;
      fraction = "0.50";
      percent = "50";
      note = criticalNote;
    })
  ];
}
49 changes: 49 additions & 0 deletions flake/opentofu/grafana/alerts/cardano-node-quality.nix-import
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
let
  # Chain-density thresholds, in percent, as interpolated into the rules.
  chainDensityLow = toString 70;
  chainDensityVeryLow = toString 50;

  legacyDensity = "cardano_node_metrics_density_real";
  newDensity = "ChainDB_Density";

  # Label matcher applied only by the "degraded" rules; it drops the listed
  # environments from the query.
  envFilter = ''{environment!~"private|sanchonet|shelley-qa|shelley_qa"}'';

  # Builds one chain-density rule.
  #   alert      alert name
  #   series     metric selector placed inside the quantile
  #   threshold  percentage compared against, also shown in the text
  # The expression takes the 0.2 quantile per environment of `series * 20`,
  # scales it by 100 and compares it with `threshold`.
  mkDensityAlert = {
    alert,
    series,
    threshold,
  }: let
    message = "Degraded Chain Density: more than 20% of nodes have low chain density (<${threshold}%) in environment {{$labels.environment}}.";
  in {
    inherit alert;
    expr = "100 * quantile by(environment) (0.2, (${series} * 20)) < ${threshold}";
    for = "5m";
    labels = {severity = "page";};
    annotations = {
      summary = message;
      description = message;
    };
  };
in {
  namespace = "cardano";
  name = "cardano-node-quality";
  rule = [
    (mkDensityAlert {
      alert = "chain_quality_degraded";
      series = "${legacyDensity}${envFilter}";
      threshold = chainDensityLow;
    })
    (mkDensityAlert {
      alert = "chain_quality_degraded_new_tracing";
      series = "${newDensity}${envFilter}";
      threshold = chainDensityLow;
    })
    (mkDensityAlert {
      alert = "chain_quality_degraded_very_low";
      series = legacyDensity;
      threshold = chainDensityVeryLow;
    })
    (mkDensityAlert {
      alert = "chain_quality_degraded_very_low_new_tracing";
      series = newDensity;
      threshold = chainDensityVeryLow;
    })
  ];
}

0 comments on commit 35e594a

Please sign in to comment.