forked from LinTechSo/sloth
/
alert_rules.go
120 lines (102 loc) · 4.29 KB
/
alert_rules.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
package prometheus
import (
"bytes"
"context"
"fmt"
"text/template"
"github.com/prometheus/prometheus/model/rulefmt"
"github.com/jonas27/sloth/internal/alert"
)
// alertGenFunc knows how to generate a single Prometheus alert rule for an
// SLO. It receives the SLO, the metadata of the alert being generated and the
// quick and slow multiwindow multi-burn alerts, and returns the resulting rule
// or an error.
type alertGenFunc func(slo SLO, sloAlert AlertMeta, quick, slow alert.MWMBAlert) (*rulefmt.Rule, error)
// sloAlertRulesGenerator generates the Prometheus alert rules of an SLO.
// The construction of each individual rule is delegated to alertGenFunc.
type sloAlertRulesGenerator struct {
// alertGenFunc builds one alert rule; GenerateSLOAlertRules calls it once
// for the page alert and once for the ticket alert.
alertGenFunc alertGenFunc
}
// SLOAlertRulesGenerator knows how to generate the SLO prometheus alert rules
// from an SLO. It is wired with defaultSLOAlertGenerator as its rule builder.
var SLOAlertRulesGenerator = sloAlertRulesGenerator{alertGenFunc: defaultSLOAlertGenerator}
// GenerateSLOAlertRules returns the Prometheus alert rules of the given SLO.
//
// At most two rules are produced, always in this order: the page alert (built
// from alerts.PageQuick and alerts.PageSlow) and the ticket alert (built from
// alerts.TicketQuick and alerts.TicketSlow). An alert whose metadata has
// Disable set is skipped. The first generation failure aborts the call and is
// returned wrapped, with a nil slice. ctx is not used by this implementation.
func (s sloAlertRulesGenerator) GenerateSLOAlertRules(ctx context.Context, slo SLO, alerts alert.MWMBAlertGroup) ([]rulefmt.Rule, error) {
	result := []rulefmt.Rule{}

	// collect generates the rule described by meta (unless it is disabled)
	// and appends it to result.
	collect := func(meta AlertMeta, quick, slow alert.MWMBAlert) error {
		if meta.Disable {
			return nil
		}

		generated, err := s.alertGenFunc(slo, meta, quick, slow)
		if err != nil {
			return err
		}

		result = append(result, *generated)
		return nil
	}

	// Page alert goes first so the output order stays page, ticket.
	if err := collect(slo.PageAlertMeta, alerts.PageQuick, alerts.PageSlow); err != nil {
		return nil, fmt.Errorf("could not create page alert: %w", err)
	}

	if err := collect(slo.TicketAlertMeta, alerts.TicketQuick, alerts.TicketSlow); err != nil {
		return nil, fmt.Errorf("could not create ticket alert: %w", err)
	}

	return result, nil
}
// defaultSLOAlertGenerator builds the alert rule for one SLO alert by
// rendering mwmbAlertTpl with the quick and slow alert settings.
//
// The returned rule is named after sloAlert.Name, uses the rendered template
// as its expression, and carries the generated title/summary annotations and
// severity label merged (via mergeLabels) with the annotations and labels
// configured on sloAlert. An error is returned only when the template fails
// to render.
func defaultSLOAlertGenerator(slo SLO, sloAlert AlertMeta, quick, slow alert.MWMBAlert) (*rulefmt.Rule, error) {
	// Values consumed by mwmbAlertTpl. The field names are part of the
	// template contract and must match the ones it references.
	data := struct {
		MetricFilter         string
		ErrorBudgetRatio     float64
		QuickShortMetric     string
		QuickShortBurnFactor float64
		QuickLongMetric      string
		QuickLongBurnFactor  float64
		SlowShortMetric      string
		SlowShortBurnFactor  float64
		SlowQuickMetric      string
		SlowQuickBurnFactor  float64
		WindowLabel          string
	}{
		// Selector restricting the SLI metrics to this SLO's ID labels.
		MetricFilter: labelsToPromFilter(slo.GetSLOIDPromLabels()),
		// Taken from quick; per the original author's note quick and slow
		// share the same value, so either one would do.
		ErrorBudgetRatio:     quick.ErrorBudget / 100,
		QuickShortMetric:     slo.GetSLIErrorMetric(quick.ShortWindow),
		QuickShortBurnFactor: quick.BurnRateFactor,
		QuickLongMetric:      slo.GetSLIErrorMetric(quick.LongWindow),
		QuickLongBurnFactor:  quick.BurnRateFactor,
		SlowShortMetric:      slo.GetSLIErrorMetric(slow.ShortWindow),
		SlowShortBurnFactor:  slow.BurnRateFactor,
		// Despite the "SlowQuick" name, these hold the slow alert's long window.
		SlowQuickMetric:     slo.GetSLIErrorMetric(slow.LongWindow),
		SlowQuickBurnFactor: slow.BurnRateFactor,
		WindowLabel:         sloWindowLabelName,
	}

	var rendered bytes.Buffer
	if err := mwmbAlertTpl.Execute(&rendered, data); err != nil {
		return nil, fmt.Errorf("could not render alert expression: %w", err)
	}

	// Severity is read from quick; per the original author's note quick and
	// slow carry the same one.
	sev := quick.Severity.String()

	// Annotations generated for every alert. The {{$labels.*}} placeholders
	// are emitted verbatim into the rule rather than expanded here.
	generatedAnnotations := map[string]string{
		"title":   fmt.Sprintf("(%s) {{$labels.%s}} {{$labels.%s}} SLO error budget burn rate is too fast.", sev, sloServiceLabelName, sloNameLabelName),
		"summary": fmt.Sprintf("{{$labels.%s}} {{$labels.%s}} SLO error budget burn rate is over expected.", sloServiceLabelName, sloNameLabelName),
	}

	// Only the severity label is added here. The labels from the rules are
	// left out on purpose: the alerts inherit them, and repeating them would
	// trigger overridden-label warnings.
	generatedLabels := map[string]string{
		sloSeverityLabelName: sev,
	}

	rule := rulefmt.Rule{
		Alert:       sloAlert.Name,
		Expr:        rendered.String(),
		Annotations: mergeLabels(generatedAnnotations, sloAlert.Annotations),
		Labels:      mergeLabels(generatedLabels, sloAlert.Labels),
	}

	return &rule, nil
}
// mwmbAlertTpl is the multiburn multiwindow alert expression template.
//
// The rendered expression is the "or" of two groups, quick and slow. Each
// group is the "and" of two comparisons, and each comparison checks that an
// SLI error metric (restricted by MetricFilter) is above its burn factor
// multiplied by ErrorBudgetRatio, aggregated with max without the
// WindowLabel label. The SlowQuick* fields are filled by
// defaultSLOAlertGenerator with the slow alert's long-window values.
//
// template.Must panics if the template text does not parse, and the
// "missingkey=error" option makes execution fail when a map key is missing.
var mwmbAlertTpl = template.Must(template.New("mwmbAlertTpl").Option("missingkey=error").Parse(`(
max({{ .QuickShortMetric }}{{ .MetricFilter}} > ({{ .QuickShortBurnFactor }} * {{ .ErrorBudgetRatio }})) without ({{ .WindowLabel }})
and
max({{ .QuickLongMetric }}{{ .MetricFilter}} > ({{ .QuickLongBurnFactor }} * {{ .ErrorBudgetRatio }})) without ({{ .WindowLabel }})
)
or
(
max({{ .SlowShortMetric }}{{ .MetricFilter }} > ({{ .SlowShortBurnFactor }} * {{ .ErrorBudgetRatio }})) without ({{ .WindowLabel }})
and
max({{ .SlowQuickMetric }}{{ .MetricFilter }} > ({{ .SlowQuickBurnFactor }} * {{ .ErrorBudgetRatio }})) without ({{ .WindowLabel }})
)
`))