Skip to content

Commit

Permalink
Separate health checks by tags (ava-labs#1579)
Browse files Browse the repository at this point in the history
Co-authored-by: Ceyhun Onur <ceyhun.onur@avalabs.org>
  • Loading branch information
StephenButtolph and ceyonur committed Jun 8, 2023
1 parent 26242ce commit efb7d90
Show file tree
Hide file tree
Showing 7 changed files with 218 additions and 71 deletions.
38 changes: 38 additions & 0 deletions api/health/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Health Checking

## Health Check Types

### Readiness

Readiness is a special type of health check. Readiness checks will only run until they pass for the first time. After a readiness check passes, it will never be run again. These checks are typically used to indicate that the startup of a component has finished.

### Health

Health checks typically indicate that a component is operating as expected. The health of a component may flip between healthy and unhealthy at any time, based on whatever heuristic the component exposes.

### Liveness

Liveness checks are intended to indicate that a component has become unhealthy and has no way to recover.

## Naming and Tags

All registered checks must have a unique name which will be included in the health check results.

Additionally, checks can optionally specify an arbitrary number of tags which can be used to group health checks together.

### Special Tags

- "all" is a tag that is automatically added to every check that is registered.
- "application" checks are checks that are globally applicable. They are always included in health check results, regardless of which tags are requested, so it is not possible to filter application-wide health checks out of a response.

## Health Check Worker

Readiness, Health, and Liveness checks are each implemented using their own health check worker.

A health check worker starts a goroutine that updates the health of all registered checks every `freq`. By default `freq` is set to `30s`.

When a health check is added, it will always initially report as unhealthy.

Every health check runs in its own goroutine to maximize concurrency. It is guaranteed that no locks from the health checker are held during the execution of the health check.

When the health check worker is stopped, it will finish executing any currently running health checks and then terminate its primary goroutine. After the health check worker is stopped, the health checks will never run again.
29 changes: 18 additions & 11 deletions api/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,15 @@ import (
"github.com/ava-labs/avalanchego/utils/logging"
)

// GlobalTag is the tag that is returned for all health check results,
// regardless of the tags passed to the Reporter.
// Registering a health check with this tag will ensure that it is always
// included in the results.
const GlobalTag = "global"
const (
// AllTag is the tag that is automatically added to every registered check.
AllTag = "all"
// ApplicationTag marks a check as application-wide: the check will act as
// if it had specified every tag that has been registered.
// Registering a health check with this tag will ensure that it is always
// included in all health query results.
ApplicationTag = "application"
)

var _ Health = (*health)(nil)

Expand Down Expand Up @@ -59,17 +63,17 @@ type health struct {
}

func New(log logging.Logger, registerer prometheus.Registerer) (Health, error) {
readinessWorker, err := newWorker("readiness", registerer)
readinessWorker, err := newWorker(log, "readiness", registerer)
if err != nil {
return nil, err
}

healthWorker, err := newWorker("health", registerer)
healthWorker, err := newWorker(log, "health", registerer)
if err != nil {
return nil, err
}

livenessWorker, err := newWorker("liveness", registerer)
livenessWorker, err := newWorker(log, "liveness", registerer)
return &health{
log: log,
readiness: readinessWorker,
Expand All @@ -93,7 +97,8 @@ func (h *health) RegisterLivenessCheck(name string, checker Checker, tags ...str
func (h *health) Readiness(tags ...string) (map[string]Result, bool) {
results, healthy := h.readiness.Results(tags...)
if !healthy {
h.log.Warn("failing readiness check",
h.log.Warn("failing check",
zap.String("namespace", "readiness"),
zap.Reflect("reason", results),
)
}
Expand All @@ -103,7 +108,8 @@ func (h *health) Readiness(tags ...string) (map[string]Result, bool) {
func (h *health) Health(tags ...string) (map[string]Result, bool) {
results, healthy := h.health.Results(tags...)
if !healthy {
h.log.Warn("failing health check",
h.log.Warn("failing check",
zap.String("namespace", "health"),
zap.Reflect("reason", results),
)
}
Expand All @@ -113,7 +119,8 @@ func (h *health) Health(tags ...string) (map[string]Result, bool) {
func (h *health) Liveness(tags ...string) (map[string]Result, bool) {
results, healthy := h.liveness.Results(tags...)
if !healthy {
h.log.Warn("failing liveness check",
h.log.Warn("failing check",
zap.String("namespace", "liveness"),
zap.Reflect("reason", results),
)
}
Expand Down
6 changes: 3 additions & 3 deletions api/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ func TestTags(t *testing.T) {
require.NoError(h.RegisterHealthCheck("check2", check, "tag1"))
require.NoError(h.RegisterHealthCheck("check3", check, "tag2"))
require.NoError(h.RegisterHealthCheck("check4", check, "tag1", "tag2"))
require.NoError(h.RegisterHealthCheck("check5", check, GlobalTag))
require.NoError(h.RegisterHealthCheck("check5", check, ApplicationTag))

// default checks
{
Expand Down Expand Up @@ -377,8 +377,8 @@ func TestTags(t *testing.T) {
require.Contains(healthResult, "check5")
require.True(health)

// add global tag
require.NoError(h.RegisterHealthCheck("check7", check, GlobalTag))
// add application tag
require.NoError(h.RegisterHealthCheck("check7", check, ApplicationTag))

awaitHealthy(t, h, false)

Expand Down
17 changes: 11 additions & 6 deletions api/health/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,21 @@ import "github.com/prometheus/client_golang/prometheus"

type metrics struct {
// failingChecks keeps track of the number of failing checks
failingChecks prometheus.Gauge
failingChecks *prometheus.GaugeVec
}

func newMetrics(namespace string, registerer prometheus.Registerer) (*metrics, error) {
metrics := &metrics{
failingChecks: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Name: "checks_failing",
Help: "number of currently failing health checks",
}),
failingChecks: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "checks_failing",
Help: "number of currently failing health checks",
},
[]string{"tag"},
),
}
metrics.failingChecks.WithLabelValues(AllTag).Set(0)
metrics.failingChecks.WithLabelValues(ApplicationTag).Set(0)
return metrics, registerer.Register(metrics.failingChecks)
}

0 comments on commit efb7d90

Please sign in to comment.