From bba4879b60da0a645becb124cb690be91a1e748a Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 13 Jun 2023 19:52:30 -0400 Subject: [PATCH 01/10] feat: adjustable rule_group interval This allows for more expensive SLO recording rules by letting users create a custom Prometheus `rule_group.interval`. https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#rule_group Now instead of all rule groups using the default global evaluation interval, a custom interval can be set on all sets of recording rules for an SLO. If no `interval` is set then the global default will be assumed, matching current Sloth behavior. Resolves: https://github.com/slok/sloth/issues/367 --- internal/prometheus/model.go | 21 +++++++++++---------- internal/prometheus/spec.go | 19 ++++++++++--------- internal/prometheus/storage.go | 25 ++++++++++++++++--------- pkg/prometheus/api/v1/v1.go | 2 ++ 4 files changed, 39 insertions(+), 28 deletions(-) diff --git a/internal/prometheus/model.go b/internal/prometheus/model.go index f2ac02c1..09dd2519 100644 --- a/internal/prometheus/model.go +++ b/internal/prometheus/model.go @@ -39,16 +39,17 @@ type AlertMeta struct { // SLO represents a service level objective configuration. type SLO struct { - ID string `validate:"required,name"` - Name string `validate:"required,name"` - Description string - Service string `validate:"required,name"` - SLI SLI `validate:"required"` - TimeWindow time.Duration `validate:"required"` - Objective float64 `validate:"gt=0,lte=100"` - Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"` - PageAlertMeta AlertMeta - TicketAlertMeta AlertMeta + ID string `validate:"required,name"` + Name string `validate:"required,name"` + Description string + Service string `validate:"required,name"` + RuleGroupInterval string + SLI SLI `validate:"required"` + TimeWindow time.Duration `validate:"required"` + Objective float64 `validate:"gt=0,lte=100"` + Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"` + PageAlertMeta AlertMeta + TicketAlertMeta AlertMeta } type SLOGroup struct { diff --git a/internal/prometheus/spec.go b/internal/prometheus/spec.go index 1935aa38..5164005a 100644 --- a/internal/prometheus/spec.go +++ b/internal/prometheus/spec.go @@ -69,15 +69,16 @@ func (y YAMLSpecLoader) mapSpecToModel(ctx context.Context, spec prometheusv1.Sp models := make([]SLO, 0, len(spec.SLOs)) for _, specSLO := range spec.SLOs { slo := SLO{ - ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name), - Name: specSLO.Name, - Description: specSLO.Description, - Service: spec.Service, - TimeWindow: y.windowPeriod, - Objective: specSLO.Objective, - Labels: mergeLabels(spec.Labels, specSLO.Labels), - PageAlertMeta: AlertMeta{Disable: true}, - TicketAlertMeta: AlertMeta{Disable: true}, + ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name), + RuleGroupInterval: specSLO.RuleGroupInterval, + Name: specSLO.Name, + Description: specSLO.Description, + Service: spec.Service, + TimeWindow: y.windowPeriod, + Objective: specSLO.Objective, + Labels: mergeLabels(spec.Labels, specSLO.Labels), + PageAlertMeta: AlertMeta{Disable: true}, + TicketAlertMeta: AlertMeta{Disable: true}, } // Set SLIs. diff --git a/internal/prometheus/storage.go b/internal/prometheus/storage.go index 9b37ca92..c60dcc4b 100644 --- a/internal/prometheus/storage.go +++ b/internal/prometheus/storage.go @@ -48,24 +48,31 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor ruleGroups := ruleGroupsYAMLv2{} for _, slo := range slos { + ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval) + if err != nil { + return fmt.Errorf("could not parse rule_group interval duration %w", err) + } if len(slo.Rules.SLIErrorRecRules) > 0 { ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID), - Rules: slo.Rules.SLIErrorRecRules, + Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID), + RuleGroupInterval: ruleGroupIntervalDuration, + Rules: slo.Rules.SLIErrorRecRules, }) } if len(slo.Rules.MetadataRecRules) > 0 { ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID), - Rules: slo.Rules.MetadataRecRules, + Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID), + RuleGroupInterval: ruleGroupIntervalDuration, + Rules: slo.Rules.MetadataRecRules, }) } if len(slo.Rules.AlertRules) > 0 { ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID), - Rules: slo.Rules.AlertRules, + Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID), + RuleGroupInterval: ruleGroupIntervalDuration, + Rules: slo.Rules.AlertRules, }) } } @@ -112,7 +119,7 @@ type ruleGroupsYAMLv2 struct { } type ruleGroupYAMLv2 struct { - Name string `yaml:"name"` - Interval prommodel.Duration `yaml:"interval,omitempty"` - Rules []rulefmt.Rule `yaml:"rules"` + Name string `yaml:"name"` + RuleGroupInterval prommodel.Duration `yaml:"interval,omitempty"` + Rules []rulefmt.Rule `yaml:"rules"` } diff --git a/pkg/prometheus/api/v1/v1.go b/pkg/prometheus/api/v1/v1.go index d81d367d..0a608466 100644 --- a/pkg/prometheus/api/v1/v1.go +++ b/pkg/prometheus/api/v1/v1.go @@ -89,6 +89,8 @@ type SLO struct { // Alerting is the configuration with all the things related with the SLO // alerts. Alerting Alerting `yaml:"alerting"` + // RuleGroupInterval is an optional value for how often the Prometheus rule_group should be evaluated. + RuleGroupInterval string `yaml:"interval,omitempty"` } // SLI will tell what is good or bad for the SLO. From 7863cfe963e013f4142fdba8cd7977031f9803e9 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Tue, 13 Jun 2023 20:06:50 -0400 Subject: [PATCH 02/10] docs: add example file for custom rule groups --- examples/custom_rule_group_interval.yml | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 examples/custom_rule_group_interval.yml diff --git a/examples/custom_rule_group_interval.yml b/examples/custom_rule_group_interval.yml new file mode 100644 index 00000000..a8d4aa0d --- /dev/null +++ b/examples/custom_rule_group_interval.yml @@ -0,0 +1,30 @@ +# This example shows how you can adjust the Prometheus rule_group interval for expensive SLOs +# https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#rule_group +# The SLO SLI measures the rate of CPU seconds spent performing softirqs +# +# `sloth generate -i ./examples/custom_rule_group_interval.yml` +# +version: "prometheus/v1" +service: "myapp" +labels: + owner: "myteam" +slos: + - name: "cpu-availability" + objective: 99.99 + description: "Example, expensive SLO. Recording rules will run every 2 minutes." + interval: "2m" + sli: + events: + error_query: | + sum( + rate(node_cpu_seconds_total{mode="softirq"}[{{.window}}]) + ) + total_query: | + sum( + rate(node_cpu_seconds_total[{{.window}}]) + ) + alerting: + page_alert: + disable: true + ticket_alert: + disable: true From 3bba8091fccfdc7ab986442c33a290a40a6eb1b5 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 14 Jun 2023 17:44:27 -0400 Subject: [PATCH 03/10] ref: only append RuleGroupInterval when not empty Related to how the unit test coverage works, this will make sure RuleGroupInterval is not appended to RuleGroups unless it is defined and not empty. Not requring the `yaml` file does an okay job of this but expliclity writing the rules without RuleGroupInterval is safer. It's a bit ugly and repetitive but I think that's just how Go works.. --- internal/prometheus/storage.go | 75 +++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/internal/prometheus/storage.go b/internal/prometheus/storage.go index c60dcc4b..94cc6d9c 100644 --- a/internal/prometheus/storage.go +++ b/internal/prometheus/storage.go @@ -48,32 +48,69 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor ruleGroups := ruleGroupsYAMLv2{} for _, slo := range slos { - ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval) - if err != nil { - return fmt.Errorf("could not parse rule_group interval duration %w", err) - } if len(slo.Rules.SLIErrorRecRules) > 0 { - ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID), - RuleGroupInterval: ruleGroupIntervalDuration, - Rules: slo.Rules.SLIErrorRecRules, - }) + if slo.SLO.RuleGroupInterval != "" { + + ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval) + if err != nil { + return fmt.Errorf("could not parse rule_group interval duration %w", err) + } + + ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ + Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID), + RuleGroupInterval: ruleGroupIntervalDuration, + Rules: slo.Rules.SLIErrorRecRules, + }) + } else { + ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ + Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID), + Rules: slo.Rules.SLIErrorRecRules, + }) + + } } if len(slo.Rules.MetadataRecRules) > 0 { - ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID), - RuleGroupInterval: ruleGroupIntervalDuration, - Rules: slo.Rules.MetadataRecRules, - }) + if slo.SLO.RuleGroupInterval != "" { + + ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval) + if err != nil { + return fmt.Errorf("could not parse rule_group interval duration %w", err) + } + + ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ + Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID), + RuleGroupInterval: ruleGroupIntervalDuration, + Rules: slo.Rules.MetadataRecRules, + }) + } else { + ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ + Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID), + Rules: slo.Rules.MetadataRecRules, + }) + } } if len(slo.Rules.AlertRules) > 0 { - ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID), - RuleGroupInterval: ruleGroupIntervalDuration, - Rules: slo.Rules.AlertRules, - }) + if slo.SLO.RuleGroupInterval != "" { + + ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval) + if err != nil { + return fmt.Errorf("could not parse rule_group interval duration %w", err) + } + + ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ + Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID), + RuleGroupInterval: ruleGroupIntervalDuration, + Rules: slo.Rules.AlertRules, + }) + + } else { + ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ + Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID), + Rules: slo.Rules.AlertRules, + }) + } } } From 4c3a1f9bcdcfca2972c607faeba7f2e8db5ecd63 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Wed, 14 Jun 2023 17:46:33 -0400 Subject: [PATCH 04/10] test: update tests for custom rule_group interval This updates tests include `interval`'s and also includes a test to make sure the rules render correctly when interval is not included. --- internal/prometheus/storage_test.go | 45 ++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/internal/prometheus/storage_test.go b/internal/prometheus/storage_test.go index 545c692b..eeaa977f 100644 --- a/internal/prometheus/storage_test.go +++ b/internal/prometheus/storage_test.go @@ -33,7 +33,7 @@ func TestIOWriterGroupedRulesYAMLRepoStore(t *testing.T) { "Having a single SLI recording rule should render correctly.": { slos: []prometheus.StorageSLO{ { - SLO: prometheus.SLO{ID: "test1"}, + SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: "2m"}, Rules: prometheus.SLORules{ SLIErrorRecRules: []rulefmt.Rule{ { @@ -52,6 +52,7 @@ func TestIOWriterGroupedRulesYAMLRepoStore(t *testing.T) { groups: - name: sloth-slo-sli-recordings-test1 + interval: 2m rules: - record: test:record expr: test-expr @@ -91,7 +92,7 @@ groups: "Having a single SLO alert rule should render correctly.": { slos: []prometheus.StorageSLO{ { - SLO: prometheus.SLO{ID: "test1"}, + SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: "2m"}, Rules: prometheus.SLORules{ AlertRules: []rulefmt.Rule{ { @@ -111,6 +112,7 @@ groups: groups: - name: sloth-slo-alerts-test1 + interval: 2m rules: - alert: testAlert expr: test-expr @@ -120,11 +122,40 @@ groups: test-annot: one `, }, + "Having a single a blank or empty rule_group interval render correctly.": { + slos: []prometheus.StorageSLO{ + { + SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: ""}, + Rules: prometheus.SLORules{ + SLIErrorRecRules: []rulefmt.Rule{ + { + Record: "test:record", + Expr: "test-expr", + Labels: map[string]string{"test-label": "one"}, + }, + }, + }, + }, + }, + expYAML: ` +--- +# Code generated by Sloth (dev): https://github.com/slok/sloth. +# DO NOT EDIT. + +groups: +- name: sloth-slo-sli-recordings-test1 + rules: + - record: test:record + expr: test-expr + labels: + test-label: one +`, + }, "Having a multiple SLO alert and recording rules should render correctly.": { slos: []prometheus.StorageSLO{ { - SLO: prometheus.SLO{ID: "testa"}, + SLO: prometheus.SLO{ID: "testa", RuleGroupInterval: "3m"}, Rules: prometheus.SLORules{ SLIErrorRecRules: []rulefmt.Rule{ { @@ -167,7 +198,7 @@ groups: }, }, { - SLO: prometheus.SLO{ID: "testb"}, + SLO: prometheus.SLO{ID: "testb", RuleGroupInterval: "1h"}, Rules: prometheus.SLORules{ SLIErrorRecRules: []rulefmt.Rule{ { @@ -201,6 +232,7 @@ groups: groups: - name: sloth-slo-sli-recordings-testa + interval: 3m rules: - record: test:record-a1 expr: test-expr-a1 @@ -211,6 +243,7 @@ groups: labels: test-label: a-2 - name: sloth-slo-meta-recordings-testa + interval: 3m rules: - record: test:record-a3 expr: test-expr-a3 @@ -221,6 +254,7 @@ groups: labels: test-label: a-4 - name: sloth-slo-alerts-testa + interval: 3m rules: - alert: testAlertA1 expr: test-expr-a1 @@ -235,18 +269,21 @@ groups: annotations: test-annot: a-2 - name: sloth-slo-sli-recordings-testb + interval: 1h rules: - record: test:record-b1 expr: test-expr-b1 labels: test-label: b-1 - name: sloth-slo-meta-recordings-testb + interval: 1h rules: - record: test:record-b2 expr: test-expr-b2 labels: test-label: b-2 - name: sloth-slo-alerts-testb + interval: 1h rules: - alert: testAlertB1 expr: test-expr-b1 From a94fee994c770e741571f9d70efeee51890cd5f1 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Mon, 26 Jun 2023 16:11:12 -0400 Subject: [PATCH 05/10] ref: add rule intervals per rule type Instead of a singular global default, now a rule_group interval can be set for every individual type of rule_group Sloth generates. The generic, `interval:all` rule will also stay and can "fill in" any missing per-rule group defaults. Along with the default behavior of doing nothing if no `interval` is specified. --- internal/prometheus/model.go | 38 +++++++++++++++++++++++---------- internal/prometheus/spec.go | 23 +++++++++++--------- internal/prometheus/storage.go | 39 ++++++++++++++++++++++++++++------ pkg/prometheus/api/v1/v1.go | 19 +++++++++++++++-- 4 files changed, 90 insertions(+), 29 deletions(-) diff --git a/internal/prometheus/model.go b/internal/prometheus/model.go index 09dd2519..4b7f2c42 100644 --- a/internal/prometheus/model.go +++ b/internal/prometheus/model.go @@ -39,17 +39,20 @@ type AlertMeta struct { // SLO represents a service level objective configuration. type SLO struct { - ID string `validate:"required,name"` - Name string `validate:"required,name"` - Description string - Service string `validate:"required,name"` - RuleGroupInterval string - SLI SLI `validate:"required"` - TimeWindow time.Duration `validate:"required"` - Objective float64 `validate:"gt=0,lte=100"` - Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"` - PageAlertMeta AlertMeta - TicketAlertMeta AlertMeta + ID string `validate:"required,name"` + Name string `validate:"required,name"` + Description string + Service string `validate:"required,name"` + RuleGroupInterval time.Duration `validate:"time"` + SLIErrorRulesInterval time.Duration `validate:"time"` + MetadataRulesInterval time.Duration `validate:"time"` + AlertRulesInterval time.Duration `validate:"time"` + SLI SLI `validate:"required"` + TimeWindow time.Duration `validate:"required"` + Objective float64 `validate:"gt=0,lte=100"` + Labels map[string]string `validate:"dive,keys,prom_label_key,endkeys,required,prom_label_value"` + PageAlertMeta AlertMeta + TicketAlertMeta AlertMeta } type SLOGroup struct { @@ -87,6 +90,7 @@ var modelSpecValidate = func() *validator.Validate { mustRegisterValidation(v, "name", validateName) mustRegisterValidation(v, "required_if_enabled", validateRequiredEnabledAlertName) mustRegisterValidation(v, "template_vars", validateTemplateVars) + mustRegisterValidation(v, "time", validateTime) v.RegisterStructValidation(validateOneSLI, SLI{}) v.RegisterStructValidation(validateSLOGroup, SLOGroup{}) v.RegisterStructValidation(validateSLIEvents, SLIEvents{}) @@ -182,6 +186,18 @@ func validateName(fl validator.FieldLevel) bool { return nameRegexp.MatchString(s) } +// validateTime implements validator.CustomTypeFunc by validating +// a time duration. +func validateTime(fl validator.FieldLevel) bool { + s, ok := fl.Field().Interface().(time.Duration) + if !ok { + return false + } + + _, err := time.ParseDuration(s.String()) + return err == nil +} + func validateRequiredEnabledAlertName(fl validator.FieldLevel) bool { alertMeta, ok := fl.Parent().Interface().(AlertMeta) if !ok { diff --git a/internal/prometheus/spec.go b/internal/prometheus/spec.go index 5164005a..eb499b36 100644 --- a/internal/prometheus/spec.go +++ b/internal/prometheus/spec.go @@ -69,16 +69,19 @@ func (y YAMLSpecLoader) mapSpecToModel(ctx context.Context, spec prometheusv1.Sp models := make([]SLO, 0, len(spec.SLOs)) for _, specSLO := range spec.SLOs { slo := SLO{ - ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name), - RuleGroupInterval: specSLO.RuleGroupInterval, - Name: specSLO.Name, - Description: specSLO.Description, - Service: spec.Service, - TimeWindow: y.windowPeriod, - Objective: specSLO.Objective, - Labels: mergeLabels(spec.Labels, specSLO.Labels), - PageAlertMeta: AlertMeta{Disable: true}, - TicketAlertMeta: AlertMeta{Disable: true}, + ID: fmt.Sprintf("%s-%s", spec.Service, specSLO.Name), + RuleGroupInterval: specSLO.Interval.RuleGroupInterval, + SLIErrorRulesInterval: specSLO.Interval.SLIErrorRulesInterval, + MetadataRulesInterval: specSLO.Interval.MetadataRulesInterval, + AlertRulesInterval: specSLO.Interval.AlertRulesInterval, + Name: specSLO.Name, + Description: specSLO.Description, + Service: spec.Service, + TimeWindow: y.windowPeriod, + Objective: specSLO.Objective, + Labels: mergeLabels(spec.Labels, specSLO.Labels), + PageAlertMeta: AlertMeta{Disable: true}, + TicketAlertMeta: AlertMeta{Disable: true}, } // Set SLIs. diff --git a/internal/prometheus/storage.go b/internal/prometheus/storage.go index 94cc6d9c..72422739 100644 --- a/internal/prometheus/storage.go +++ b/internal/prometheus/storage.go @@ -49,9 +49,19 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor ruleGroups := ruleGroupsYAMLv2{} for _, slo := range slos { if len(slo.Rules.SLIErrorRecRules) > 0 { - if slo.SLO.RuleGroupInterval != "" { - ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval) + // 0s is default empty string value for time.Duration + if slo.SLO.RuleGroupInterval.String() != "0s" || slo.SLO.SLIErrorRulesInterval.String() != "0s" { + var ruleGroupIntervalDuration prommodel.Duration + var err error + + // if we have a valid meta rule rule_group interval, use that first and overwrite any generic ones + if slo.SLO.SLIErrorRulesInterval.String() != "0s" { + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.SLIErrorRulesInterval.String()) + } else { + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String()) + } + if err != nil { return fmt.Errorf("could not parse rule_group interval duration %w", err) } @@ -71,9 +81,18 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor } if len(slo.Rules.MetadataRecRules) > 0 { - if slo.SLO.RuleGroupInterval != "" { + // if either of these aren't empty we'll be adding a custom rule interval + if slo.SLO.RuleGroupInterval.String() != "0s" || slo.SLO.MetadataRulesInterval.String() != "0s" { + var ruleGroupIntervalDuration prommodel.Duration + var err error + + // if we have a valid meta rule rule_group interval, use that firs + if slo.SLO.MetadataRulesInterval.String() != "0s" { + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.MetadataRulesInterval.String()) + } else { + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String()) + } - ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval) if err != nil { return fmt.Errorf("could not parse rule_group interval duration %w", err) } @@ -92,9 +111,17 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor } if len(slo.Rules.AlertRules) > 0 { - if slo.SLO.RuleGroupInterval != "" { + if slo.SLO.RuleGroupInterval.String() != "0s" || slo.SLO.AlertRulesInterval.String() != "0s" { + var ruleGroupIntervalDuration prommodel.Duration + var err error + + // if we have a valid meta rule rule_group interval, use that firs + if slo.SLO.AlertRulesInterval.String() != "0s" { + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.AlertRulesInterval.String()) + } else { + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String()) + } - ruleGroupIntervalDuration, err := prommodel.ParseDuration(slo.SLO.RuleGroupInterval) if err != nil { return fmt.Errorf("could not parse rule_group interval duration %w", err) } diff --git a/pkg/prometheus/api/v1/v1.go b/pkg/prometheus/api/v1/v1.go index 0a608466..44f71a3c 100644 --- a/pkg/prometheus/api/v1/v1.go +++ b/pkg/prometheus/api/v1/v1.go @@ -54,6 +54,8 @@ // disable: true package v1 +import "time" + const Version = "prometheus/v1" //go:generate gomarkdoc -o ./README.md ./ @@ -89,8 +91,9 @@ type SLO struct { // Alerting is the configuration with all the things related with the SLO // alerts. Alerting Alerting `yaml:"alerting"` - // RuleGroupInterval is an optional value for how often the Prometheus rule_group should be evaluated. - RuleGroupInterval string `yaml:"interval,omitempty"` + // Interval is the configuration for all things related to SLO rule_group intervals + // for specific rule groups and all rules. + Interval Interval `yaml:"interval,omitempty"` } // SLI will tell what is good or bad for the SLO. @@ -150,6 +153,18 @@ type Alerting struct { TicketAlert Alert `yaml:"ticket_alert,omitempty"` } +type Interval struct { + // RuleGroupInterval is an optional value for how often the Prometheus rule_group should be evaluated. + // RuleGroupInterval string `yaml:"rulegroup_interval,omitempty"` + RuleGroupInterval time.Duration `yaml:"all,omitempty"` + // Otherwise, specify custom rule_group intervals for each set of recording rules. + // RuleGroupInterval will "fill-in" for any non-specified individual groups + // but individual group settings override RuleGroupInterval. + SLIErrorRulesInterval time.Duration `yaml:"slierror,omitempty"` + MetadataRulesInterval time.Duration `yaml:"metadata,omitempty"` + AlertRulesInterval time.Duration `yaml:"alert,omitempty"` +} + // Alert configures specific SLO alert. type Alert struct { // Disable disables the alert and makes Sloth not generating this alert. This From ca78450231e2ded7baa2f995b3ff7f97af933be0 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Mon, 26 Jun 2023 16:55:48 -0400 Subject: [PATCH 06/10] test: add tests for custom rule group intervals Specific ones and general/specific combined --- internal/prometheus/storage_test.go | 160 ++++++++++++++++++++++++++-- 1 file changed, 154 insertions(+), 6 deletions(-) diff --git a/internal/prometheus/storage_test.go b/internal/prometheus/storage_test.go index eeaa977f..b550b2a0 100644 --- a/internal/prometheus/storage_test.go +++ b/internal/prometheus/storage_test.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "testing" + "time" "github.com/prometheus/prometheus/model/rulefmt" "github.com/stretchr/testify/assert" @@ -12,7 +13,29 @@ import ( "github.com/slok/sloth/internal/prometheus" ) +func parseDuration(durationStr string, t *testing.T) time.Duration { + duration, err := time.ParseDuration(durationStr) + if err != nil { + t.Errorf("could not parse duration: %v", err) + return 0 + } + return duration +} + func TestIOWriterGroupedRulesYAMLRepoStore(t *testing.T) { + // set intervals ahead of time + ruleGroupInterval := parseDuration("2m", t) + // 0s = default/blank + ruleGroupIntervalBlank := parseDuration("0s", t) + // A/B for multiple group test + ruleGroupIntervalA := parseDuration("3m", t) + ruleGroupIntervalB := parseDuration("1h", t) + // for individual settings + sliErrorRulesInterval := parseDuration("4m", t) + metadataRulesInterval := parseDuration("5m", t) + alertRulesInterval := parseDuration("6m", t) + // need test for mix of rulegroupinterval and individual + tests := map[string]struct { slos []prometheus.StorageSLO expYAML string @@ -30,10 +53,11 @@ func TestIOWriterGroupedRulesYAMLRepoStore(t *testing.T) { expErr: true, }, - "Having a single SLI recording rule should render correctly.": { + "Having a single SLI recording rule with the generic rule_group interval should render correctly.": { + slos: []prometheus.StorageSLO{ { - SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: "2m"}, + SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: ruleGroupInterval}, Rules: prometheus.SLORules{ SLIErrorRecRules: []rulefmt.Rule{ { @@ -92,7 +116,7 @@ groups: "Having a single SLO alert rule should render correctly.": { slos: []prometheus.StorageSLO{ { - SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: "2m"}, + SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: ruleGroupInterval}, Rules: prometheus.SLORules{ AlertRules: []rulefmt.Rule{ { @@ -125,7 +149,7 @@ groups: "Having a single a blank or empty rule_group interval render correctly.": { slos: []prometheus.StorageSLO{ { - SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: ""}, + SLO: prometheus.SLO{ID: "test1", RuleGroupInterval: ruleGroupIntervalBlank}, Rules: prometheus.SLORules{ SLIErrorRecRules: []rulefmt.Rule{ { @@ -155,7 +179,7 @@ groups: "Having a multiple SLO alert and recording rules should render correctly.": { slos: []prometheus.StorageSLO{ { - SLO: prometheus.SLO{ID: "testa", RuleGroupInterval: "3m"}, + SLO: prometheus.SLO{ID: "testa", RuleGroupInterval: ruleGroupIntervalA}, Rules: prometheus.SLORules{ SLIErrorRecRules: []rulefmt.Rule{ { @@ -198,7 +222,7 @@ groups: }, }, { - SLO: prometheus.SLO{ID: "testb", RuleGroupInterval: "1h"}, + SLO: prometheus.SLO{ID: "testb", RuleGroupInterval: ruleGroupIntervalB}, Rules: prometheus.SLORules{ SLIErrorRecRules: []rulefmt.Rule{ { @@ -293,6 +317,130 @@ groups: test-annot: b-1 `, }, + "Having a mix of rule group intervals should render correctly.": { + + slos: []prometheus.StorageSLO{ + { + SLO: prometheus.SLO{ID: "testa", SLIErrorRulesInterval: sliErrorRulesInterval, MetadataRulesInterval: metadataRulesInterval, AlertRulesInterval: alertRulesInterval}, + Rules: prometheus.SLORules{ + SLIErrorRecRules: []rulefmt.Rule{ + { + Record: "test:record-a1", + Expr: "test-expr-a1", + Labels: map[string]string{"test-label": "a-1"}, + }, + }, + MetadataRecRules: []rulefmt.Rule{ + { + Record: "test:record-a3", + Expr: "test-expr-a3", + Labels: map[string]string{"test-label": "a-3"}, + }, + }, + AlertRules: []rulefmt.Rule{ + { + Alert: "testAlertA1", + Expr: "test-expr-a1", + Labels: map[string]string{"test-label": "a-1"}, + Annotations: map[string]string{"test-annot": "a-1"}, + }, + }, + }, + }, + }, + expYAML: ` +--- +# Code generated by Sloth (dev): https://github.com/slok/sloth. +# DO NOT EDIT. + +groups: +- name: sloth-slo-sli-recordings-testa + interval: 4m + rules: + - record: test:record-a1 + expr: test-expr-a1 + labels: + test-label: a-1 +- name: sloth-slo-meta-recordings-testa + interval: 5m + rules: + - record: test:record-a3 + expr: test-expr-a3 + labels: + test-label: a-3 +- name: sloth-slo-alerts-testa + interval: 6m + rules: + - alert: testAlertA1 + expr: test-expr-a1 + labels: + test-label: a-1 + annotations: + test-annot: a-1 +`}, + "Having a mix of rule group intervals and the overarching rule_group interval should render correctly.": { + + slos: []prometheus.StorageSLO{ + { + SLO: prometheus.SLO{ID: "testa", SLIErrorRulesInterval: sliErrorRulesInterval, MetadataRulesInterval: metadataRulesInterval, RuleGroupInterval: ruleGroupInterval}, + // in this case we use the broad RuleGroupInterval to set a 2m interval for the alert rules + // that dont have an explicit one set + Rules: prometheus.SLORules{ + SLIErrorRecRules: []rulefmt.Rule{ + { + Record: "test:record-a1", + Expr: "test-expr-a1", + Labels: map[string]string{"test-label": "a-1"}, + }, + }, + MetadataRecRules: []rulefmt.Rule{ + { + Record: "test:record-a3", + Expr: "test-expr-a3", + Labels: map[string]string{"test-label": "a-3"}, + }, + }, + AlertRules: []rulefmt.Rule{ + { + Alert: "testAlertA1", + Expr: "test-expr-a1", + Labels: map[string]string{"test-label": "a-1"}, + Annotations: map[string]string{"test-annot": "a-1"}, + }, + }, + }, + }, + }, + expYAML: ` +--- +# Code generated by Sloth (dev): https://github.com/slok/sloth. +# DO NOT EDIT. + +groups: +- name: sloth-slo-sli-recordings-testa + interval: 4m + rules: + - record: test:record-a1 + expr: test-expr-a1 + labels: + test-label: a-1 +- name: sloth-slo-meta-recordings-testa + interval: 5m + rules: + - record: test:record-a3 + expr: test-expr-a3 + labels: + test-label: a-3 +- name: sloth-slo-alerts-testa + interval: 2m + rules: + - alert: testAlertA1 + expr: test-expr-a1 + labels: + test-label: a-1 + annotations: + test-annot: a-1 +`}, } for name, test := range tests { From dc3994be548f1598c633cc19de25b8b45a3dc416 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Mon, 26 Jun 2023 16:56:12 -0400 Subject: [PATCH 07/10] chore: update example for custom rule groups --- examples/custom_rule_group_interval.yml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/custom_rule_group_interval.yml b/examples/custom_rule_group_interval.yml index a8d4aa0d..5433fa69 100644 --- a/examples/custom_rule_group_interval.yml +++ b/examples/custom_rule_group_interval.yml @@ -12,7 +12,14 @@ slos: - name: "cpu-availability" objective: 99.99 description: "Example, expensive SLO. Recording rules will run every 2 minutes." - interval: "2m" + # alternative way of specifying interval for all three sets of rules + # interval: + # all: "15m" + interval: + all: "6m" + slierror: "4m" + # metadata: "2m" + alert: "2m" sli: events: error_query: | @@ -24,7 +31,14 @@ slos: rate(node_cpu_seconds_total[{{.window}}]) ) alerting: + name: MyServiceHighErrorRate + labels: + category: "availability" + annotations: + summary: "High error rate on 'myservice' requests responses" page_alert: - disable: true + labels: + severity: pageteam + routing_key: myteam ticket_alert: disable: true From e1c7bdb7228bf0245fb7fa622a5854a50423091d Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Mon, 3 Jul 2023 08:57:30 -0400 Subject: [PATCH 08/10] docs: finish comments Co-authored-by: Will Hegedus --- internal/prometheus/storage.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/prometheus/storage.go b/internal/prometheus/storage.go index 72422739..588daf6c 100644 --- a/internal/prometheus/storage.go +++ b/internal/prometheus/storage.go @@ -86,7 +86,7 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor var ruleGroupIntervalDuration prommodel.Duration var err error - // if we have a valid meta rule rule_group interval, use that firs + // if we have a valid meta rule rule_group interval, use that first and overwrite any generic ones if slo.SLO.MetadataRulesInterval.String() != "0s" { ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.MetadataRulesInterval.String()) } else { @@ -115,7 +115,7 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor var ruleGroupIntervalDuration prommodel.Duration var err error - // if we have a valid meta rule rule_group interval, use that firs + // if we have a valid meta rule rule_group interval, use that first and overwrite any generic ones if slo.SLO.AlertRulesInterval.String() != "0s" { ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.AlertRulesInterval.String()) } else { From d34697a591cb238b77e4db1770ea0773371c4c3b Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Mon, 3 Jul 2023 09:19:31 -0400 Subject: [PATCH 09/10] ref: rule interval groups less repetitive --- internal/prometheus/storage.go | 130 ++++++++++++++++----------------- 1 file changed, 64 insertions(+), 66 deletions(-) diff --git a/internal/prometheus/storage.go b/internal/prometheus/storage.go index 588daf6c..4edf4881 100644 --- a/internal/prometheus/storage.go +++ b/internal/prometheus/storage.go @@ -50,94 +50,92 @@ func (i IOWriterGroupedRulesYAMLRepo) StoreSLOs(ctx context.Context, slos []Stor for _, slo := range slos { if len(slo.Rules.SLIErrorRecRules) > 0 { - // 0s is default empty string value for time.Duration - if slo.SLO.RuleGroupInterval.String() != "0s" || slo.SLO.SLIErrorRulesInterval.String() != "0s" { - var ruleGroupIntervalDuration prommodel.Duration - var err error - - // if we have a valid meta rule rule_group interval, use that first and overwrite any generic ones - if slo.SLO.SLIErrorRulesInterval.String() != "0s" { - ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.SLIErrorRulesInterval.String()) + group := ruleGroupYAMLv2{ + Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID), + Rules: slo.Rules.SLIErrorRecRules, + } + + var ruleGroupIntervalDuration prommodel.Duration + var err error + + switch { + case slo.SLO.SLIErrorRulesInterval.String() != "0s": + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.SLIErrorRulesInterval.String()) + if err != nil { + return fmt.Errorf("could not parse rule_group interval duration for alerts %w", err) } else { - ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String()) + group.RuleGroupInterval = ruleGroupIntervalDuration } - + case slo.SLO.RuleGroupInterval.String() != "0s": + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String()) if err != nil { - return fmt.Errorf("could not parse rule_group interval duration %w", err) + return fmt.Errorf("could not parse default ('all') rule_group interval duration %w", err) + } else { + group.RuleGroupInterval = ruleGroupIntervalDuration } - - ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID), - RuleGroupInterval: ruleGroupIntervalDuration, - Rules: slo.Rules.SLIErrorRecRules, - }) - } else { - ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-sli-recordings-%s", slo.SLO.ID), - Rules: slo.Rules.SLIErrorRecRules, - }) - } + + ruleGroups.Groups = append(ruleGroups.Groups, group) } if len(slo.Rules.MetadataRecRules) > 0 { - // if either of these aren't empty we'll be adding a custom rule interval - if slo.SLO.RuleGroupInterval.String() != "0s" || slo.SLO.MetadataRulesInterval.String() != "0s" { - var ruleGroupIntervalDuration prommodel.Duration - var err error - - // if we have a valid meta rule rule_group interval, use that first and overwrite any generic ones - if slo.SLO.MetadataRulesInterval.String() != "0s" { - ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.MetadataRulesInterval.String()) + + group := ruleGroupYAMLv2{ + Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID), + Rules: slo.Rules.MetadataRecRules, + } + + var ruleGroupIntervalDuration prommodel.Duration + var err error + + switch { + case slo.SLO.MetadataRulesInterval.String() != "0s": + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.MetadataRulesInterval.String()) + if err != nil { + return fmt.Errorf("could not parse rule_group interval duration for alerts %w", err) } else { - ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String()) + group.RuleGroupInterval = ruleGroupIntervalDuration } - + case slo.SLO.RuleGroupInterval.String() != "0s": + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String()) if err != nil { - return fmt.Errorf("could not parse rule_group interval duration %w", err) + return fmt.Errorf("could not parse default ('all') rule_group interval duration %w", err) + } else { + group.RuleGroupInterval = ruleGroupIntervalDuration } - - ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID), - RuleGroupInterval: ruleGroupIntervalDuration, - Rules: slo.Rules.MetadataRecRules, - }) - } else { - ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-meta-recordings-%s", slo.SLO.ID), - Rules: slo.Rules.MetadataRecRules, - }) } + + ruleGroups.Groups = append(ruleGroups.Groups, group) } if len(slo.Rules.AlertRules) > 0 { - if slo.SLO.RuleGroupInterval.String() != "0s" || slo.SLO.AlertRulesInterval.String() != "0s" { - var ruleGroupIntervalDuration prommodel.Duration - var err error - // if we have a valid meta rule rule_group interval, use that first and overwrite any generic ones - if slo.SLO.AlertRulesInterval.String() != "0s" { - ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.AlertRulesInterval.String()) + group := ruleGroupYAMLv2{ + Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID), + Rules: slo.Rules.AlertRules, + } + + var ruleGroupIntervalDuration prommodel.Duration + var err error + + switch { + case slo.SLO.AlertRulesInterval.String() != "0s": + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.AlertRulesInterval.String()) + if err != nil { + return fmt.Errorf("could not parse rule_group interval duration for alerts %w", err) } else { - ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String()) + group.RuleGroupInterval = ruleGroupIntervalDuration } - + case slo.SLO.RuleGroupInterval.String() != "0s": + ruleGroupIntervalDuration, err = prommodel.ParseDuration(slo.SLO.RuleGroupInterval.String()) if err != nil { - return fmt.Errorf("could not parse rule_group interval duration %w", err) + return fmt.Errorf("could not parse default ('all') rule_group interval duration %w", err) + } else { + group.RuleGroupInterval = ruleGroupIntervalDuration } - - ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID), - RuleGroupInterval: ruleGroupIntervalDuration, - Rules: slo.Rules.AlertRules, - }) - - } else { - ruleGroups.Groups = append(ruleGroups.Groups, ruleGroupYAMLv2{ - Name: fmt.Sprintf("sloth-slo-alerts-%s", slo.SLO.ID), - Rules: slo.Rules.AlertRules, - }) } + + ruleGroups.Groups = append(ruleGroups.Groups, group) } } From 92854e300c5b05f1929b1c8b62c8fa0fbe7f0ee3 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Mon, 3 Jul 2023 09:21:57 -0400 Subject: [PATCH 10/10] docs: clean up custome rule group example --- examples/custom_rule_group_interval.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/custom_rule_group_interval.yml b/examples/custom_rule_group_interval.yml index 5433fa69..94cfc461 100644 --- a/examples/custom_rule_group_interval.yml +++ b/examples/custom_rule_group_interval.yml @@ -14,11 +14,10 @@ slos: description: "Example, expensive SLO. Recording rules will run every 2 minutes." # alternative way of specifying interval for all three sets of rules # interval: - # all: "15m" - interval: - all: "6m" + # all: "5m" + interval: # all of these are different sets of rule groups sloth can make slierror: "4m" - # metadata: "2m" + metadata: "2m" alert: "2m" sli: events: