-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
imp: refactor alerts files, add new alerts, add new tracing alerts
- Loading branch information
1 parent
93aa275
commit 35e594a
Showing
6 changed files
with
353 additions
and
270 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
let | ||
# Builds one block-divergence alert rule for a single environment. | ||
# | ||
# Arguments: | ||
#   env              - value matched against the `environment` label; also embedded in the alert name. | ||
#   lagBlocks        - block-number distance from the environment maximum that counts as lagging (default 6). | ||
#   lagSeconds       - slot-number distance below which a node is still treated as keeping up (default 120). | ||
#   thresholdMinutes - how long the condition must hold before firing (default 5). | ||
#   useLegacyTracing - true selects the cardano_node_metrics_* names, false the ChainDB_* names | ||
#                      and appends "_new_tracing" to the alert name. | ||
# | ||
# Returns an attribute set with alert, expr, for, labels and annotations. | ||
mkDivergenceAlert = { | ||
env, | ||
lagBlocks ? 6, | ||
lagSeconds ? 120, | ||
thresholdMinutes ? 5, | ||
useLegacyTracing ? true, | ||
}: let | ||
# Picks the legacy or the new-tracing variant of a value. | ||
byTracing = legacy: modern: | ||
if useLegacyTracing | ||
then legacy | ||
else modern; | ||
blockMetric = byTracing "cardano_node_metrics_blockNum_int" "ChainDB_BlockNum"; | ||
slotMetric = byTracing "cardano_node_metrics_slotNum_int" "ChainDB_SlotNum"; | ||
alertSuffix = byTracing "" "_new_tracing"; | ||
# Rendered once, reused in the expression and in the annotations. | ||
blocks = toString lagBlocks; | ||
seconds = toString lagSeconds; | ||
minutes = toString thresholdMinutes; | ||
in { | ||
alert = "cardano_node_block_divergence_${env}${alertSuffix}"; | ||
# Both comparisons use `bool`, so each side is 0 or 1. The difference equals 1 only when | ||
# the block lag exceeds the limit (1) while the slot lag is NOT under its limit (0). | ||
expr = '' | ||
( | ||
(abs(max(${blockMetric}{environment="${env}"}) - on() group_right() ${blockMetric}{environment="${env}"}) > bool ${blocks}) | ||
- (abs(max(${slotMetric}{environment="${env}"}) - on() group_right() ${slotMetric}{environment="${env}"}) < bool ${seconds}) | ||
) == 1 | ||
''; | ||
for = "${minutes}m"; | ||
labels = {severity = "page";}; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node block divergence detected on ${env} for more than ${minutes} minutes."; | ||
description = "{{$labels.instance}}: cardano-node block divergence of more than ${blocks} blocks and ${seconds} seconds lag detected for more than ${minutes} minutes."; | ||
}; | ||
}; | ||
in { | ||
namespace = "cardano"; | ||
name = "cardano-node-divergence"; | ||
# One divergence alert per environment for the legacy metrics, followed by one per | ||
# environment for the new-tracing metrics, in the order the environments are listed. | ||
rule = let | ||
environments = ["mainnet" "preprod" "preview" "private" "sanchonet" "shelley-qa"]; | ||
legacyAlerts = map (env: mkDivergenceAlert {inherit env;}) environments; | ||
newTracingAlerts = | ||
map (env: | ||
mkDivergenceAlert { | ||
inherit env; | ||
useLegacyTracing = false; | ||
}) | ||
environments; | ||
in | ||
legacyAlerts ++ newTracingAlerts; | ||
} |
124 changes: 124 additions & 0 deletions
124
flake/opentofu/grafana/alerts/cardano-node-forge.nix-import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
let | ||
# Builds one KES-expiration alert rule that fires once the remaining KES periods | ||
# drop to `periodNotice` or below. | ||
# | ||
# Arguments: | ||
#   periodNotice     - remaining-period threshold; also embedded in the alert name and messages. | ||
#   useLegacyTracing - true reads the legacy remaining-periods metric directly; false derives the | ||
#                      value from the KESInfo_* metrics and appends "_new_tracing" to the alert name. | ||
# | ||
# Returns an attribute set with alert, expr, for, labels and annotations. | ||
mkKesAlert = { | ||
periodNotice, | ||
useLegacyTracing ? true, | ||
}: let | ||
# NOTE(review): the new-tracing form computes (expiry - start) - current; confirm this | ||
# yields the same quantity as the legacy remaining-periods metric. | ||
kesPeriodsRemaining = | ||
if useLegacyTracing | ||
then "cardano_node_metrics_remainingKESPeriods_int" | ||
else "(KESInfo_operationalCertificateExpiryKESPeriod - KESInfo_operationalCertificateStartKESPeriod) - KESInfo_currentKESPeriod"; | ||
notice = toString periodNotice; | ||
# The message used to hard-code "less than 10 periods" for every threshold. It now reports the | ||
# configured threshold, worded as "or fewer" to match the `<=` comparison in expr. | ||
message = "{{$labels.instance}}: cardano-node KES expiration notice: ${notice} or fewer periods until KES expiration."; | ||
in { | ||
alert = "cardano_node_KES_expiration_metric_${notice}period_notice${ | ||
if useLegacyTracing | ||
then "" | ||
else "_new_tracing" | ||
}"; | ||
expr = "${kesPeriodsRemaining} <= ${notice}"; | ||
for = "5m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = message; | ||
description = message; | ||
}; | ||
}; | ||
# Alert rule group for block forging: missing or unadopted forged blocks, forge failures, | ||
# missed slot leadership checks and KES expiration notices. Most alerts come in pairs: one | ||
# on the legacy cardano_node_metrics_* names and one (suffix "_new_tracing") on the new names. | ||
in { | ||
namespace = "cardano"; | ||
name = "cardano-node-forge"; | ||
rule = [ | ||
# Fires when the forged-block counter has not increased over the last 24h. | ||
{ | ||
alert = "cardano_node_forge_blocks_missing"; | ||
expr = "increase(cardano_node_metrics_Forge_forged_int[24h]) == 0"; | ||
for = "1m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node has not forged any blocks for 1 day."; | ||
description = '' | ||
{{$labels.instance}}: cardano-node has not forged any blocks for 1 day. | ||
This should be investigated, or the alert adjusted if this is expected.''; | ||
}; | ||
} | ||
# New-tracing counterpart of the alert above. | ||
{ | ||
alert = "cardano_node_forge_blocks_missing_new_tracing"; | ||
expr = "increase(Forge_BlocksForgedNum[24h]) == 0"; | ||
for = "1m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node has not forged any blocks for 1 day."; | ||
description = '' | ||
{{$labels.instance}}: cardano-node has not forged any blocks for 1 day. | ||
This should be investigated, or the alert adjusted if this is expected.''; | ||
}; | ||
} | ||
# Fires when the didn't-adopt counter increased during the last hour. | ||
{ | ||
alert = "cardano_node_forge_not_adopted_error"; | ||
expr = "increase(cardano_node_metrics_Forge_didnt_adopt_int[1h]) > 0"; | ||
for = "1m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node is failing to adopt recent forged blocks."; | ||
description = '' | ||
{{$labels.instance}}: cardano-node failed to adopt 1 or more blocks in the past hour. | ||
A restart of node on the affected machine(s) may be required.''; | ||
}; | ||
} | ||
# NOTE(review): the expression below subtracts Forge_ForgedSlotLast from itself. With PromQL | ||
# precedence that difference is always 0, so `== bool 0` is always 1, the abs(... - 1) term is | ||
# always 0, and the alert looks like it can never fire. Presumably the second operand was meant | ||
# to be a different (adopted-slot) metric; confirm the intended metric name. | ||
{ | ||
alert = "cardano_node_forge_not_adopted_error_new_tracing"; | ||
expr = "sum_over_time(abs((Forge_ForgedSlotLast - Forge_ForgedSlotLast == bool 0) - 1)[1h:]) > 0"; | ||
for = "1m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node is failing to adopt recent forged blocks."; | ||
description = '' | ||
{{$labels.instance}}: cardano-node failed to adopt 1 or more blocks in the past hour. | ||
A restart of node on the affected machine(s) may be required.''; | ||
}; | ||
} | ||
# New-tracing only: fires when the cannot-forge counter increased during the last hour. | ||
{ | ||
alert = "cardano_node_cannot_forge_new_tracing"; | ||
expr = "increase(Forge_NodeCannotForgeNum[1h]) > 0"; | ||
for = "1m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node is failing to forge blocks."; | ||
description = '' | ||
{{$labels.instance}}: cardano-node failed to forge 1 or more blocks in the past hour. | ||
A restart of node on the affected machine(s) may be required.''; | ||
}; | ||
} | ||
# Fires when the 5m per-second rate of missed slots stays above 0.5 for 2 minutes. | ||
# NOTE(review): the `* 1` factor is presumably a slot-length scaling; confirm. | ||
{ | ||
alert = "too_many_slot_leadership_checks_missed"; | ||
expr = "rate(cardano_node_metrics_slotsMissedNum_int[5m]) * 1 > 0.5"; | ||
for = "2m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: block producing node is failing to check for slot leadership for more than half of the slots."; | ||
description = "{{$labels.instance}}: block producing node is failing to check for slot leadership for more than half of the slots for more than 2 min."; | ||
}; | ||
} | ||
# New-tracing counterpart of the alert above. | ||
{ | ||
alert = "too_many_slot_leadership_checks_missed_new_tracing"; | ||
expr = "rate(Forge_SlotsMissed[5m]) * 1 > 0.5"; | ||
for = "2m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: block producing node is failing to check for slot leadership for more than half of the slots."; | ||
description = "{{$labels.instance}}: block producing node is failing to check for slot leadership for more than half of the slots for more than 2 min."; | ||
}; | ||
} | ||
# KES expiration notices at 10, 5 and 1 remaining periods: legacy metrics first, | ||
# then the same thresholds for the new-tracing metrics. | ||
(mkKesAlert {periodNotice = 10;}) | ||
(mkKesAlert {periodNotice = 5;}) | ||
(mkKesAlert {periodNotice = 1;}) | ||
(mkKesAlert { | ||
periodNotice = 10; | ||
useLegacyTracing = false; | ||
}) | ||
(mkKesAlert { | ||
periodNotice = 5; | ||
useLegacyTracing = false; | ||
}) | ||
(mkKesAlert { | ||
periodNotice = 1; | ||
useLegacyTracing = false; | ||
}) | ||
]; | ||
} |
92 changes: 92 additions & 0 deletions
92
flake/opentofu/grafana/alerts/cardano-node-network.nix-import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# Alert rule group for network-level health: ping latency, block adoption delay, | ||
# block utilization and block-fetch delay. Alerts suffixed "_new_tracing" repeat the | ||
# preceding legacy rule against the new metric names. | ||
let | ||
highBlockUtilization = toString 95; # Alert if blocks are above that % full. | ||
in { | ||
namespace = "cardano"; | ||
name = "cardano-node-network"; | ||
rule = [ | ||
# TODO: Legacy cardano-ops metric; potentially convert to netdata statsd push/scrape | ||
{ | ||
alert = "high_cardano_ping_latency"; | ||
expr = "avg_over_time(cardano_ping_latency_ms[5m]) > 250"; | ||
for = "30m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: Cardano average ping latency over 5 minutes has been above 250 milliseconds for the last 30 minutes"; | ||
description = "{{$labels.instance}}: Cardano average ping latency over 5 minutes has been above 250 milliseconds for the last 30 minutes."; | ||
}; | ||
} | ||
# TODO: New tracer equivalent metric? | ||
# Averages, across series, the 6h 0.95-quantile of the forge delay and fires at 4.5 or above. | ||
{ | ||
alert = "blocks_adoption_delay_too_high"; | ||
expr = "avg(quantile_over_time(0.95, cardano_node_metrics_blockadoption_forgeDelay_real[6h])) >= 4.5"; | ||
for = "1m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "Blocks adoption delay have been above 4.5s for more than 5% of blocks"; | ||
description = "Node average of blocks adoption delay have been above 4.5s for more than 5% of blocks for more than 6 hours"; | ||
}; | ||
} | ||
# TODO: Static max block size until node publishes max block size metric | ||
# The 6h average block size is divided by the hard-coded 90112 and expressed as a percentage. | ||
{ | ||
alert = "blocks_utilization_too_high"; | ||
expr = "100 * avg(avg_over_time(cardano_node_metrics_blockfetchclient_blocksize[6h]) / 90112) > ${highBlockUtilization}"; | ||
for = "5m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "Block utilization is above ${highBlockUtilization}%."; | ||
description = "Block utilization has averaged above ${highBlockUtilization}% for more than 6h."; | ||
}; | ||
} | ||
# TODO: Static max block size until node publishes max block size metric | ||
{ | ||
alert = "blocks_utilization_too_high_new_tracing"; | ||
expr = "100 * avg(avg_over_time(Blockfetch_Client_Blocksize[6h]) / 90112) > ${highBlockUtilization}"; | ||
for = "5m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "Block utilization is above ${highBlockUtilization}%."; | ||
description = "Block utilization has averaged above ${highBlockUtilization}% for more than 6h."; | ||
}; | ||
} | ||
# Fires when the cdfFive metric stays below 0.90 for 10 minutes. | ||
# NOTE(review): the "6 hour window" in the descriptions is not visible in the expression; | ||
# presumably the metric itself is computed over that window — confirm. | ||
{ | ||
alert = "cardano_blockfetchclient_blockdelay_high"; | ||
expr = ''cardano_node_metrics_blockfetchclient_blockdelay_cdfFive < 0.90''; | ||
for = "10m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node has less than 90% of blocks arriving in 5 seconds."; | ||
description = "{{$labels.instance}}: cardano-node has block fetch delays with less than 90% of blocks over a 6 hour window arriving in 5 seconds."; | ||
}; | ||
} | ||
{ | ||
alert = "cardano_blockfetchclient_blockdelay_high_new_tracing"; | ||
expr = ''Blockfetch_Client_Blockdelay_cdfFive < 0.90''; | ||
for = "10m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node has less than 90% of blocks arriving in 5 seconds."; | ||
description = "{{$labels.instance}}: cardano-node has block fetch delays with less than 90% of blocks over a 6 hour window arriving in 5 seconds."; | ||
}; | ||
} | ||
# Same metric as the "high" alerts above, with the stricter 0.50 threshold. | ||
{ | ||
alert = "cardano_blockfetchclient_blockdelay_critical"; | ||
expr = "cardano_node_metrics_blockfetchclient_blockdelay_cdfFive < 0.50"; | ||
for = "10m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node has less than 50% of blocks arriving in 5 seconds."; | ||
description = "{{$labels.instance}}: cardano-node has block fetch delays with less than 50% of blocks over a 6 hour window arriving in 5 seconds. This is abnormal even for mempool trace enabled nodes."; | ||
}; | ||
} | ||
{ | ||
alert = "cardano_blockfetchclient_blockdelay_critical_new_tracing"; | ||
expr = ''Blockfetch_Client_Blockdelay_cdfFive < 0.50''; | ||
for = "10m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "{{$labels.instance}}: cardano-node has less than 50% of blocks arriving in 5 seconds."; | ||
description = "{{$labels.instance}}: cardano-node has block fetch delays with less than 50% of blocks over a 6 hour window arriving in 5 seconds. This is abnormal even for mempool trace enabled nodes."; | ||
}; | ||
} | ||
]; | ||
} |
49 changes: 49 additions & 0 deletions
49
flake/opentofu/grafana/alerts/cardano-node-quality.nix-import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Alert rule group for chain quality: per-environment chain density checks at two | ||
# thresholds, each in a legacy-metric and a new-tracing variant. | ||
let | ||
# Percentage thresholds interpolated into the expressions and messages below. | ||
chainDensityLow = toString 70; | ||
chainDensityVeryLow = toString 50; | ||
in { | ||
namespace = "cardano"; | ||
name = "cardano-node-quality"; | ||
rule = [ | ||
# Takes the 0.2 quantile of (density * 20) per environment, scales it by 100 and compares it | ||
# with the "low" threshold. Environments matching private|sanchonet|shelley-qa|shelley_qa are | ||
# excluded from this threshold by the label matcher. | ||
# NOTE(review): the `* 20` factor presumably normalizes against an expected density of 0.05; confirm. | ||
{ | ||
alert = "chain_quality_degraded"; | ||
expr = ''100 * quantile by(environment) (0.2, (cardano_node_metrics_density_real{environment!~"private|sanchonet|shelley-qa|shelley_qa"} * 20)) < ${chainDensityLow}''; | ||
for = "5m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "Degraded Chain Density: more than 20% of nodes have low chain density (<${chainDensityLow}%) in environment {{$labels.environment}}."; | ||
description = "Degraded Chain Density: more than 20% of nodes have low chain density (<${chainDensityLow}%) in environment {{$labels.environment}}."; | ||
}; | ||
} | ||
# New-tracing counterpart of the alert above. | ||
{ | ||
alert = "chain_quality_degraded_new_tracing"; | ||
expr = ''100 * quantile by(environment) (0.2, (ChainDB_Density{environment!~"private|sanchonet|shelley-qa|shelley_qa"} * 20)) < ${chainDensityLow}''; | ||
for = "5m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "Degraded Chain Density: more than 20% of nodes have low chain density (<${chainDensityLow}%) in environment {{$labels.environment}}."; | ||
description = "Degraded Chain Density: more than 20% of nodes have low chain density (<${chainDensityLow}%) in environment {{$labels.environment}}."; | ||
}; | ||
} | ||
# Same computation against the "very low" threshold, with no environment filter. | ||
{ | ||
alert = "chain_quality_degraded_very_low"; | ||
expr = ''100 * quantile by(environment) (0.2, (cardano_node_metrics_density_real * 20)) < ${chainDensityVeryLow}''; | ||
for = "5m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "Degraded Chain Density: more than 20% of nodes have low chain density (<${chainDensityVeryLow}%) in environment {{$labels.environment}}."; | ||
description = "Degraded Chain Density: more than 20% of nodes have low chain density (<${chainDensityVeryLow}%) in environment {{$labels.environment}}."; | ||
}; | ||
} | ||
# New-tracing counterpart of the alert above. | ||
{ | ||
alert = "chain_quality_degraded_very_low_new_tracing"; | ||
expr = ''100 * quantile by(environment) (0.2, (ChainDB_Density * 20)) < ${chainDensityVeryLow}''; | ||
for = "5m"; | ||
labels.severity = "page"; | ||
annotations = { | ||
summary = "Degraded Chain Density: more than 20% of nodes have low chain density (<${chainDensityVeryLow}%) in environment {{$labels.environment}}."; | ||
description = "Degraded Chain Density: more than 20% of nodes have low chain density (<${chainDensityVeryLow}%) in environment {{$labels.environment}}."; | ||
}; | ||
} | ||
]; | ||
} |
Oops, something went wrong.