diff --git a/examples/custom_rule_group_interval.yml b/examples/custom_rule_group_interval.yml new file mode 100644 index 00000000..94cfc461 --- /dev/null +++ b/examples/custom_rule_group_interval.yml @@ -0,0 +1,43 @@ +# This example shows how you can adjust the Prometheus rule_group interval for expensive SLOs +# https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#rule_group +# The SLO SLI measures the rate of CPU seconds spent performing softirqs +# +# `sloth generate -i ./examples/custom_rule_group_interval.yml` +# +version: "prometheus/v1" +service: "myapp" +labels: + owner: "myteam" +slos: + - name: "cpu-availability" + objective: 99.99 + description: "Example, expensive SLO. SLI error rules run every 4 minutes; metadata and alert rules run every 2 minutes." + # alternatively, set a single interval for all three rule groups + # interval: + # all: "5m" + interval: # each key sets the interval of one of the rule groups sloth generates + slierror: "4m" + metadata: "2m" + alert: "2m" + sli: + events: + error_query: | + sum( + rate(node_cpu_seconds_total{mode="softirq"}[{{.window}}]) + ) + total_query: | + sum( + rate(node_cpu_seconds_total[{{.window}}]) + ) + alerting: + name: MyServiceHighErrorRate + labels: + category: "availability" + annotations: + summary: "High error rate on 'myservice' requests responses" + page_alert: + labels: + severity: pageteam + routing_key: myteam + ticket_alert: + disable: true diff --git a/internal/prometheus/model.go b/internal/prometheus/model.go index f2ac02c1..4b7f2c42 100644 --- a/internal/prometheus/model.go +++ b/internal/prometheus/model.go @@ -39,16 +39,20 @@ type AlertMeta struct { // SLO represents a service level objective configuration. 
type SLO struct { - ID string `validate:"required,name"` - Name string `validate:"required,name"` - Description string - Service string `validate:"required,name"` - SLI SLI `validate:"required"` - TimeWindow time.Duration `validate:"required"` - Objective float64 `validate:"gt=0,lte=100"` - Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"` - PageAlertMeta AlertMeta - TicketAlertMeta AlertMeta + ID string `validate:"required,name"` + Name string `validate:"required,name"` + Description string + Service string `validate:"required,name"` + RuleGroupInterval time.Duration `validate:"time"` // Fallback interval for every rule group without its own setting; zero means unset. + SLIErrorRulesInterval time.Duration `validate:"time"` // Interval of the SLI error recording rule group; zero falls back to RuleGroupInterval. + MetadataRulesInterval time.Duration `validate:"time"` // Interval of the metadata recording rule group; zero falls back to RuleGroupInterval. + AlertRulesInterval time.Duration `validate:"time"` // Interval of the alert rule group; zero falls back to RuleGroupInterval. + SLI SLI `validate:"required"` + TimeWindow time.Duration `validate:"required"` + Objective float64 `validate:"gt=0,lte=100"` + Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"` + PageAlertMeta AlertMeta + TicketAlertMeta AlertMeta } type SLOGroup struct { @@ -86,6 +90,7 @@ var modelSpecValidate = func() *validator.Validate { mustRegisterValidation(v, "name", validateName) mustRegisterValidation(v, "required_if_enabled", validateRequiredEnabledAlertName) mustRegisterValidation(v, "template_vars", validateTemplateVars) + mustRegisterValidation(v, "time", validateTime) v.RegisterStructValidation(validateOneSLI, SLI{}) v.RegisterStructValidation(validateSLOGroup, SLOGroup{}) v.RegisterStructValidation(validateSLIEvents, SLIEvents{}) @@ -181,6 +186,18 @@ func validateName(fl validator.FieldLevel) bool { return nameRegexp.MatchString(s) } +// validateTime implements validator.Func by validating +// a time.Duration field. 
+func validateTime(fl validator.FieldLevel) bool {
+	d, ok := fl.Field().Interface().(time.Duration)
+	if !ok {
+		return false
+	}
+	// Zero means "not set". Negative intervals are rejected here instead of
+	// surfacing later as a Prometheus duration parse error at storage time.
+	return d >= 0
+}
+
 func validateRequiredEnabledAlertName(fl validator.FieldLevel) bool {
 	alertMeta, ok := fl.Parent().Interface().(AlertMeta)
 	if !ok {
diff --git a/internal/prometheus/spec.go b/internal/prometheus/spec.go
index 1935aa38..eb499b36 100644
--- a/internal/prometheus/spec.go
+++ b/internal/prometheus/spec.go
@@ -69,15 +69,19 @@ func (y YAMLSpecLoader) mapSpecToModel(ctx context.Context, spec prometheusv1.Sp
 	models := make([]SLO, 0, len(spec.SLOs))
 	for _, specSLO := range spec.SLOs {
 		slo := SLO{
-			ID:              fmt.Sprintf("%s-%s", spec.Service, specSLO.Name),
-			Name:            specSLO.Name,
-			Description:     specSLO.Description,
-			Service:         spec.Service,
-			TimeWindow:      y.windowPeriod,
-			Objective:       specSLO.Objective,
-			Labels:          mergeLabels(spec.Labels, specSLO.Labels),
-			PageAlertMeta:   AlertMeta{Disable: true},
-			TicketAlertMeta: AlertMeta{Disable: true},
+			ID:                    fmt.Sprintf("%s-%s", spec.Service, specSLO.Name),
+			RuleGroupInterval:     specSLO.Interval.RuleGroupInterval,
+			SLIErrorRulesInterval: specSLO.Interval.SLIErrorRulesInterval,
+			MetadataRulesInterval: specSLO.Interval.MetadataRulesInterval,
+			AlertRulesInterval:    specSLO.Interval.AlertRulesInterval,
+			Name:                  specSLO.Name,
+			Description:           specSLO.Description,
+			Service:               spec.Service,
+			TimeWindow:            y.windowPeriod,
+			Objective:             specSLO.Objective,
+			Labels:                mergeLabels(spec.Labels, specSLO.Labels),
+			PageAlertMeta:         AlertMeta{Disable: true},
+			TicketAlertMeta:       AlertMeta{Disable: true},
 		}
 
 		// Set SLIs.
diff --git a/internal/prometheus/storage.go b/internal/prometheus/storage.go
index 9b37ca92..4edf4881 100644
--- a/internal/prometheus/storage.go
+++ b/internal/prometheus/storage.go
@@ -49,24 +49,78 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor
 	ruleGroups := ruleGroupsYAMLv2{}
 	for _, slo := range slos {
 		if len(slo.Rules.SLIErrorRecRules) > 0 {
-			ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
+			group := ruleGroupYAMLv2{
 				Name:  fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID),
 				Rules: slo.Rules.SLIErrorRecRules,
-			})
+			}
+
+			// The SLI error specific interval wins over the generic ('all') one; zero means unset.
+			switch {
+			case slo.SLO.SLIErrorRulesInterval != 0:
+				interval, err := prommodel.ParseDuration(slo.SLO.SLIErrorRulesInterval.String())
+				if err != nil {
+					return fmt.Errorf("could not parse rule_group interval duration for SLI error rules: %w", err)
+				}
+				group.RuleGroupInterval = interval
+			case slo.SLO.RuleGroupInterval != 0:
+				interval, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String())
+				if err != nil {
+					return fmt.Errorf("could not parse default ('all') rule_group interval duration: %w", err)
+				}
+				group.RuleGroupInterval = interval
+			}
+
+			ruleGroups.Groups = append(ruleGroups.Groups, group)
 		}
 
 		if len(slo.Rules.MetadataRecRules) > 0 {
-			ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
+			group := ruleGroupYAMLv2{
 				Name:  fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID),
 				Rules: slo.Rules.MetadataRecRules,
-			})
+			}
+
+			// The metadata specific interval wins over the generic ('all') one; zero means unset.
+			switch {
+			case slo.SLO.MetadataRulesInterval != 0:
+				interval, err := prommodel.ParseDuration(slo.SLO.MetadataRulesInterval.String())
+				if err != nil {
+					return fmt.Errorf("could not parse rule_group interval duration for metadata rules: %w", err)
+				}
+				group.RuleGroupInterval = interval
+			case slo.SLO.RuleGroupInterval != 0:
+				interval, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String())
+				if err != nil {
+					return fmt.Errorf("could not parse default ('all') rule_group interval duration: %w", err)
+				}
+				group.RuleGroupInterval = interval
+			}
+
+			ruleGroups.Groups = append(ruleGroups.Groups, group)
 		}
 
 		if len(slo.Rules.AlertRules) > 0 {
-			ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{
+			group := ruleGroupYAMLv2{
 				Name:  fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID),
 				Rules: slo.Rules.AlertRules,
-			})
+			}
+
+			// The alert specific interval wins over the generic ('all') one; zero means unset.
+			switch {
+			case slo.SLO.AlertRulesInterval != 0:
+				interval, err := prommodel.ParseDuration(slo.SLO.AlertRulesInterval.String())
+				if err != nil {
+					return fmt.Errorf("could not parse rule_group interval duration for alerts: %w", err)
+				}
+				group.RuleGroupInterval = interval
+			case slo.SLO.RuleGroupInterval != 0:
+				interval, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String())
+				if err != nil {
+					return fmt.Errorf("could not parse default ('all') rule_group interval duration: %w", err)
+				}
+				group.RuleGroupInterval = interval
+			}
+
+			ruleGroups.Groups = append(ruleGroups.Groups, group)
 		}
 	}
 
@@ -112,7 +166,7 @@ type ruleGroupsYAMLv2 struct {
 }
 
 type ruleGroupYAMLv2 struct {
-	Name     string             `yaml:"name"`
-	Interval prommodel.Duration `yaml:"interval,omitempty"`
-	Rules    []rulefmt.Rule     `yaml:"rules"`
+	Name              string             `yaml:"name"`
+	RuleGroupInterval prommodel.Duration `yaml:"interval,omitempty"` // Zero is omitted from the rendered YAML.
+	Rules             []rulefmt.Rule     `yaml:"rules"`
 }
diff --git a/internal/prometheus/storage_test.go b/internal/prometheus/storage_test.go
index 545c692b..b550b2a0 100644
--- a/internal/prometheus/storage_test.go
+++
b/internal/prometheus/storage_test.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"context"
 	"testing"
+	"time"
 
 	"github.com/prometheus/prometheus/model/rulefmt"
 	"github.com/stretchr/testify/assert"
@@ -12,7 +13,29 @@ import (
 	"github.com/slok/sloth/internal/prometheus"
 )
 
+// parseDuration parses durationStr, aborting the calling test when it is not a valid Go duration.
+func parseDuration(durationStr string, t *testing.T) time.Duration {
+	t.Helper() // Report the failure at the caller's line, not inside this helper.
+	duration, err := time.ParseDuration(durationStr)
+	if err != nil {
+		t.Fatalf("could not parse duration %q: %v", durationStr, err)
+	}
+	return duration
+}
+
 func TestIOWriterGroupedRulesYAMLRepoStore(t *testing.T) {
+	// set intervals ahead of time
+	ruleGroupInterval := parseDuration("2m", t)
+	// 0s = default/blank
+	ruleGroupIntervalBlank := parseDuration("0s", t)
+	// A/B for multiple group test
+	ruleGroupIntervalA := parseDuration("3m", t)
+	ruleGroupIntervalB := parseDuration("1h", t)
+	// for individual settings
+	sliErrorRulesInterval := parseDuration("4m", t)
+	metadataRulesInterval := parseDuration("5m", t)
+	alertRulesInterval := parseDuration("6m", t)
+
 	tests := map[string]struct {
 		slos    []prometheus.StorageSLO
 		expYAML string
@@ -30,10 +53,11 @@ func TestIOWriterGroupedRulesYAMLRepoStore(t *testing.T) {
 			expErr: true,
 		},
 
-		"Having a single SLI recording rule should render correctly.": {
+		"Having a single SLI recording rule with the generic rule_group interval should render correctly.": {
+
 			slos: []prometheus.StorageSLO{
 				{
-					SLO: prometheus.SLO{ID: "test1"},
+					SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: ruleGroupInterval},
 					Rules: prometheus.SLORules{
 						SLIErrorRecRules: []rulefmt.Rule{
 							{
@@ -52,6 +76,7 @@ func TestIOWriterGroupedRulesYAMLRepoStore(t *testing.T) {
 
 groups:
 - name: sloth-slo-sli-recordings-test1
+  interval: 2m
   rules:
   - record: test:record
     expr: test-expr
@@ -91,7 +116,7 @@ groups:
 		"Having a single SLO alert rule should render correctly.": {
 			slos: []prometheus.StorageSLO{
 				{
-					SLO: prometheus.SLO{ID: "test1"},
+					SLO: prometheus.SLO{ID: "test1",
RuleGroupInterval: ruleGroupInterval}, Rules: prometheus.SLORules{ AlertRules: []rulefmt.Rule{ { @@ -111,6 +136,7 @@ groups: groups: - name: sloth-slo-alerts-test1 + interval: 2m rules: - alert: testAlert expr: test-expr @@ -120,11 +146,40 @@ groups: test-annot: one `, }, + "Having a single a blank or empty rule_group interval render correctly.": { + slos: []prometheus.StorageSLO{ + { + SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: ruleGroupIntervalBlank}, + Rules: prometheus.SLORules{ + SLIErrorRecRules: []rulefmt.Rule{ + { + Record: "test:record", + Expr: "test-expr", + Labels: map[string]string{"test-label": "one"}, + }, + }, + }, + }, + }, + expYAML: ` +--- +# Code generated by Sloth (dev): https://github.com/slok/sloth. +# DO NOT EDIT. + +groups: +- name: sloth-slo-sli-recordings-test1 + rules: + - record: test:record + expr: test-expr + labels: + test-label: one +`, + }, "Having a multiple SLO alert and recording rules should render correctly.": { slos: []prometheus.StorageSLO{ { - SLO: prometheus.SLO{ID: "testa"}, + SLO: prometheus.SLO{ID: "testa", RuleGroupInterval: ruleGroupIntervalA}, Rules: prometheus.SLORules{ SLIErrorRecRules: []rulefmt.Rule{ { @@ -167,7 +222,7 @@ groups: }, }, { - SLO: prometheus.SLO{ID: "testb"}, + SLO: prometheus.SLO{ID: "testb", RuleGroupInterval: ruleGroupIntervalB}, Rules: prometheus.SLORules{ SLIErrorRecRules: []rulefmt.Rule{ { @@ -201,6 +256,7 @@ groups: groups: - name: sloth-slo-sli-recordings-testa + interval: 3m rules: - record: test:record-a1 expr: test-expr-a1 @@ -211,6 +267,7 @@ groups: labels: test-label: a-2 - name: sloth-slo-meta-recordings-testa + interval: 3m rules: - record: test:record-a3 expr: test-expr-a3 @@ -221,6 +278,7 @@ groups: labels: test-label: a-4 - name: sloth-slo-alerts-testa + interval: 3m rules: - alert: testAlertA1 expr: test-expr-a1 @@ -235,18 +293,21 @@ groups: annotations: test-annot: a-2 - name: sloth-slo-sli-recordings-testb + interval: 1h rules: - record: test:record-b1 expr: 
test-expr-b1 labels: test-label: b-1 - name: sloth-slo-meta-recordings-testb + interval: 1h rules: - record: test:record-b2 expr: test-expr-b2 labels: test-label: b-2 - name: sloth-slo-alerts-testb + interval: 1h rules: - alert: testAlertB1 expr: test-expr-b1 @@ -256,6 +317,130 @@ groups: test-annot: b-1 `, }, + "Having a mix of rule group intervals should render correctly.": { + + slos: []prometheus.StorageSLO{ + { + SLO: prometheus.SLO{ID: "testa", SLIErrorRulesInterval: sliErrorRulesInterval, MetadataRulesInterval: metadataRulesInterval, AlertRulesInterval: alertRulesInterval}, + Rules: prometheus.SLORules{ + SLIErrorRecRules: []rulefmt.Rule{ + { + Record: "test:record-a1", + Expr: "test-expr-a1", + Labels: map[string]string{"test-label": "a-1"}, + }, + }, + MetadataRecRules: []rulefmt.Rule{ + { + Record: "test:record-a3", + Expr: "test-expr-a3", + Labels: map[string]string{"test-label": "a-3"}, + }, + }, + AlertRules: []rulefmt.Rule{ + { + Alert: "testAlertA1", + Expr: "test-expr-a1", + Labels: map[string]string{"test-label": "a-1"}, + Annotations: map[string]string{"test-annot": "a-1"}, + }, + }, + }, + }, + }, + expYAML: ` +--- +# Code generated by Sloth (dev): https://github.com/slok/sloth. +# DO NOT EDIT. 
+ +groups: +- name: sloth-slo-sli-recordings-testa + interval: 4m + rules: + - record: test:record-a1 + expr: test-expr-a1 + labels: + test-label: a-1 +- name: sloth-slo-meta-recordings-testa + interval: 5m + rules: + - record: test:record-a3 + expr: test-expr-a3 + labels: + test-label: a-3 +- name: sloth-slo-alerts-testa + interval: 6m + rules: + - alert: testAlertA1 + expr: test-expr-a1 + labels: + test-label: a-1 + annotations: + test-annot: a-1 +`}, + "Having a mix of rule group intervals and the overarching rule_group interval should render correctly.": { + + slos: []prometheus.StorageSLO{ + { + SLO: prometheus.SLO{ID: "testa", SLIErrorRulesInterval: sliErrorRulesInterval, MetadataRulesInterval: metadataRulesInterval, RuleGroupInterval: ruleGroupInterval}, + // in this case we use the broad RuleGroupInterval to set a 2m interval for the alert rules + // that dont have an explicit one set + Rules: prometheus.SLORules{ + SLIErrorRecRules: []rulefmt.Rule{ + { + Record: "test:record-a1", + Expr: "test-expr-a1", + Labels: map[string]string{"test-label": "a-1"}, + }, + }, + MetadataRecRules: []rulefmt.Rule{ + { + Record: "test:record-a3", + Expr: "test-expr-a3", + Labels: map[string]string{"test-label": "a-3"}, + }, + }, + AlertRules: []rulefmt.Rule{ + { + Alert: "testAlertA1", + Expr: "test-expr-a1", + Labels: map[string]string{"test-label": "a-1"}, + Annotations: map[string]string{"test-annot": "a-1"}, + }, + }, + }, + }, + }, + expYAML: ` +--- +# Code generated by Sloth (dev): https://github.com/slok/sloth. +# DO NOT EDIT. 
+ +groups: +- name: sloth-slo-sli-recordings-testa + interval: 4m + rules: + - record: test:record-a1 + expr: test-expr-a1 + labels: + test-label: a-1 +- name: sloth-slo-meta-recordings-testa + interval: 5m + rules: + - record: test:record-a3 + expr: test-expr-a3 + labels: + test-label: a-3 +- name: sloth-slo-alerts-testa + interval: 2m + rules: + - alert: testAlertA1 + expr: test-expr-a1 + labels: + test-label: a-1 + annotations: + test-annot: a-1 +`}, } for name, test := range tests { diff --git a/pkg/prometheus/api/v1/v1.go b/pkg/prometheus/api/v1/v1.go index d81d367d..44f71a3c 100644 --- a/pkg/prometheus/api/v1/v1.go +++ b/pkg/prometheus/api/v1/v1.go @@ -54,6 +54,8 @@ // disable: true package v1 +import "time" + const Version = "prometheus/v1" //go:generate gomarkdoc -o ./README.md ./ @@ -89,6 +91,9 @@ type SLO struct { // Alerting is the configuration with all the things related with the SLO // alerts. Alerting Alerting `yaml:"alerting"` + // Interval is the configuration for all things related to SLO rule_group intervals + // for specific rule groups and all rules. + Interval Interval `yaml:"interval,omitempty"` } // SLI will tell what is good or bad for the SLO. @@ -148,6 +153,18 @@ type Alerting struct { TicketAlert Alert `yaml:"ticket_alert,omitempty"` } +// Interval configures how often Prometheus evaluates the rule groups generated for an SLO. +type Interval struct { + // RuleGroupInterval (YAML key `all`) is the optional interval applied to every rule group without its own setting. + RuleGroupInterval time.Duration `yaml:"all,omitempty"` + // SLIErrorRulesInterval, MetadataRulesInterval and AlertRulesInterval set the interval + // of one rule group each (SLI error recordings, metadata recordings and alerts). + // A group-specific value overrides RuleGroupInterval; unset groups fall back to it. 
+ SLIErrorRulesInterval time.Duration `yaml:"slierror,omitempty"` + MetadataRulesInterval time.Duration `yaml:"metadata,omitempty"` + AlertRulesInterval time.Duration `yaml:"alert,omitempty"` +} + // Alert configures specific SLO alert. type Alert struct { // Disable disables the alert and makes Sloth not generating this alert. This