Merge pull request #8504 from calvinrzachman/healthcheck
lnd/healthcheck: add checks after initialization + success/failure callbacks
yyforyongyu committed Mar 13, 2024
2 parents 1fd6bc8 + add2b29 commit d9887f3
Showing 3 changed files with 280 additions and 19 deletions.
3 changes: 3 additions & 0 deletions docs/release-notes/release-notes-0.18.0.md
@@ -205,6 +205,9 @@
for blinded path payloads to allow fuzzing before LND fully supports
blinded payment relay.

* Allow `healthcheck` package users to provide [custom callbacks](https://github.com/lightningnetwork/lnd/pull/8504)
which will execute whenever a healthcheck succeeds/fails.

### Logging
* [Add the htlc amount](https://github.com/lightningnetwork/lnd/pull/8156) to
contract court logs in case of timed-out htlcs in order to easily spot dust
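As a minimal sketch of the feature the release note describes, here is how a package user might wire up the new callbacks, assuming the functional-option API shown in the healthcheck.go diff below. The check name, check function, and durations are illustrative placeholders and are not part of this commit.

```go
package main

import (
	"errors"
	"log"
	"os"
	"time"

	"github.com/lightningnetwork/lnd/healthcheck"
)

func main() {
	// checkDisk is a stand-in health check; a real caller would probe
	// free disk space, a TLS certificate, etc.
	checkDisk := func() error {
		if _, err := os.Stat("/"); err != nil {
			return errors.New("disk check failed")
		}
		return nil
	}

	// Attach the new success/failure callbacks via the functional
	// options added in this commit.
	obs := healthcheck.NewObservation(
		"disk space", checkDisk,
		time.Minute,    // interval between checks
		10*time.Second, // timeout for a single attempt
		time.Second,    // backoff between retries
		3,              // attempts before the check is considered failed
		healthcheck.WithSuccessCallback(func() {
			log.Println("disk check passed")
		}),
		healthcheck.WithFailureCallback(func() {
			log.Println("disk check failed after all attempts")
		}),
	)

	monitor := healthcheck.NewMonitor(&healthcheck.Config{
		Checks: []*healthcheck.Observation{obs},
		Shutdown: func(format string, params ...interface{}) {
			log.Printf("shutting down: "+format, params...)
		},
	})
	if err := monitor.Start(); err != nil {
		log.Fatalf("could not start monitor: %v", err)
	}
	defer func() { _ = monitor.Stop() }()

	// Block long enough for a few check intervals to fire.
	time.Sleep(5 * time.Minute)
}
```

If neither option is supplied, the callbacks default to no-ops, as the NewObservation changes below show.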
90 changes: 80 additions & 10 deletions healthcheck/healthcheck.go
@@ -15,6 +15,8 @@ import (
"github.com/lightningnetwork/lnd/ticker"
)

var noOpCallback = func() {}

// Config contains configuration settings for our monitor.
type Config struct {
// Checks is a set of health checks that assert that lnd has access to
@@ -96,6 +98,19 @@ func (m *Monitor) Stop() error {
return nil
}

// AddCheck adds a new healthcheck to our monitor.
func (m *Monitor) AddCheck(check *Observation) error {

m.wg.Add(1)
go func(check *Observation) {
defer m.wg.Done()

check.monitor(m.cfg.Shutdown, m.quit)
}(check)

return nil
}

// CreateCheck is a helper function that takes a function that produces an error
// and wraps it in a function that returns its result on an error channel.
// We do not wait group the goroutine running our checkFunc because we expect
@@ -137,20 +152,65 @@ type Observation struct {
// Backoff is the amount of time we back off between retries for failed
// checks.
Backoff time.Duration

// OnSuccess is a callback which will be executed when the healthcheck
// succeeds. This is optional.
OnSuccess func()

// OnFailure is a callback which will be executed when the healthcheck
// fails. This is optional.
OnFailure func()
}

// ObservationOption describes the signature of a functional option that can be
// used to modify the behaviour of an Observation.
type ObservationOption func(*Observation)

// WithSuccessCallback configures an observation with a callback to be fired
// whenever the health check succeeds.
func WithSuccessCallback(callback func()) ObservationOption {
return func(o *Observation) {
o.OnSuccess = callback
}
}

// WithFailureCallback configures an observation with a callback to be fired
// whenever the health check reaches its failure threshold.
func WithFailureCallback(callback func()) ObservationOption {
return func(o *Observation) {
o.OnFailure = callback
}
}

// NewObservation creates an observation.
func NewObservation(name string, check func() error, interval,
timeout, backoff time.Duration, attempts int) *Observation {
func NewObservation(name string, check func() error, interval, timeout,
backoff time.Duration, attempts int,
opts ...ObservationOption) *Observation {

return &Observation{
observation := &Observation{
Name: name,
Check: CreateCheck(check),
Interval: ticker.New(interval),
Attempts: attempts,
Timeout: timeout,
Backoff: backoff,
}

// Apply each option to the observation.
for _, opt := range opts {
opt(observation)
}

// Ensure that we default to NO-OP callbacks.
if observation.OnSuccess == nil {
observation.OnSuccess = noOpCallback
}

if observation.OnFailure == nil {
observation.OnFailure = noOpCallback
}

return observation
}

// String returns a string representation of an observation.
@@ -206,6 +266,17 @@ func (o *Observation) retryCheck(quit chan struct{},
var err error
select {
case err = <-o.Check():
// If our error is nil, we have passed our health check,
// so we'll invoke our success callback if defined and
// then exit.
if err == nil {
log.Debug("invoking success callback")

// Invoke the success callback.
o.OnSuccess()

return false
}

case <-time.After(o.Timeout):
err = fmt.Errorf("health check: %v timed out after: "+
@@ -216,15 +287,14 @@
return false
}

// If our error is nil, we have passed our health check, so we
// can exit.
if err == nil {
return false
}

// If we have reached our allowed number of attempts, this
// check has failed so we request shutdown.
// check has failed so we'll fire the on failure callback
// and request shutdown.
if count == o.Attempts {
log.Debug("invoking failure callback")

o.OnFailure()

shutdown("Health check: %v failed after %v "+
"calls", o, o.Attempts)
return true
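For context on the second feature in this commit, here is a minimal sketch of registering a check after the monitor has already started, using the AddCheck method added in the diff above. The observation name, check function, and durations are placeholders and not part of this commit.

```go
package main

import (
	"log"
	"time"

	"github.com/lightningnetwork/lnd/healthcheck"
)

func main() {
	// Start the monitor with no checks configured up front.
	monitor := healthcheck.NewMonitor(&healthcheck.Config{
		Checks: []*healthcheck.Observation{},
		Shutdown: func(format string, params ...interface{}) {
			log.Printf("shutting down: "+format, params...)
		},
	})
	if err := monitor.Start(); err != nil {
		log.Fatalf("could not start monitor: %v", err)
	}
	defer func() { _ = monitor.Stop() }()

	// Later, once the dependency we want to watch becomes available,
	// register a check dynamically. The check function here always
	// succeeds and is a placeholder only.
	obs := healthcheck.NewObservation(
		"tor connection",
		func() error { return nil },
		time.Minute,    // interval between checks
		10*time.Second, // timeout for a single attempt
		time.Second,    // backoff between retries
		2,              // attempts before the check is considered failed
		healthcheck.WithFailureCallback(func() {
			log.Println("tor connection check exhausted its attempts")
		}),
	)

	// AddCheck starts the monitoring goroutine for the new observation
	// immediately; no monitor restart is needed.
	if err := monitor.AddCheck(obs); err != nil {
		log.Printf("unable to add health check: %v", err)
	}

	// Block long enough for a few check intervals to fire.
	time.Sleep(5 * time.Minute)
}
```

The tests below exercise both paths: callbacks firing on success and at the failure threshold, and a check added dynamically after Monitor.Start.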
206 changes: 197 additions & 9 deletions healthcheck/healthcheck_test.go
@@ -55,11 +55,13 @@ func TestMonitor(t *testing.T) {
cfg := &Config{
Checks: []*Observation{
{
Check: mock.call,
Interval: intervalTicker,
Attempts: 2,
Backoff: 0,
Timeout: time.Hour,
Check: mock.call,
Interval: intervalTicker,
Attempts: 2,
Backoff: 0,
Timeout: time.Hour,
OnSuccess: noOpCallback,
OnFailure: noOpCallback,
},
},
Shutdown: func(string, ...interface{}) {
@@ -202,10 +204,12 @@ func TestRetryCheck(t *testing.T) {
// function. We set a zero back off so that the test
// will not wait.
observation := &Observation{
Check: mock.call,
Attempts: test.attempts,
Timeout: test.timeout,
Backoff: 0,
Check: mock.call,
Attempts: test.attempts,
Timeout: test.timeout,
Backoff: 0,
OnSuccess: noOpCallback,
OnFailure: noOpCallback,
}
quit := make(chan struct{})

@@ -238,3 +242,187 @@
})
}
}

// TestCallbacks verifies that we fire the OnSuccess/OnFailure callbacks
// as expected.
//
// - When the health check succeeds, the OnSuccess callback should fire.
// - When the failure threshold is reached, the OnFailure callback should fire.
func TestCallbacks(t *testing.T) {
intervalTicker := ticker.NewForce(time.Hour)

mock := newMockCheck(t)
failureThreshold := 3

successChan := make(chan struct{})
failChan := make(chan struct{})
shutdown := make(chan struct{})

// Create our config for monitoring. We will use a 0 back off so that
// our test does not need to wait.
observation := &Observation{
Check: mock.call,
Interval: intervalTicker,
Attempts: failureThreshold,
Backoff: 0,
Timeout: time.Hour,
OnSuccess: func() {
select {
case successChan <- struct{}{}:
case <-time.After(timeout):
t.Fatal("unable to fire onSuccess callback")
}
},
OnFailure: func() {
close(failChan)
},
}

cfg := &Config{
Checks: []*Observation{observation},
Shutdown: func(string, ...interface{}) {
shutdown <- struct{}{}
},
}
monitor := NewMonitor(cfg)
require.NoError(t, monitor.Start(), "could not start monitor")

// Tick is a helper we will use to tick our interval.
tick := func() {
select {
case intervalTicker.Force <- testTime:
case <-time.After(timeout):
t.Fatal("could not tick timer")
}
}

// We expect that the onSuccess callback is fired after each successful
// check.
for i := 0; i < failureThreshold; i++ {
tick()
mock.sendError(nil)

// We expect that the onSuccess callback will have fired.
select {
case <-successChan:
case <-time.After(timeout):
t.Fatal("expected success callback")
}

}

// Kick off another health check iteration. The monitor's internal
// retry mechanism will re-attempt the check until it has reached
// the configured maximum # of attempts.
//
// This mocks our check function failing the maximum # of times
// consecutively that it is allowed.
tick()
for i := 1; i <= failureThreshold; i++ {
mock.sendError(errNonNil)

// Verify that the onFailure callback does not fire unless
// the failure threshold (maximum # of attempts) is reached.
if i < failureThreshold {
select {
case <-failChan:
t.Fatal("unexpected onFailure callback")
default:
}
}
}

// After reaching the failure threshold for this health check,
// we expect that the onFailure callback will have fired.
select {
case <-failChan:
case <-time.After(timeout):
t.Fatal("expected onFailure callback")
}

// Since we have failed within our allowed number of retries, we now
// expect a call to our shutdown function.
select {
case <-shutdown:
case <-time.After(timeout):
t.Fatal("expected shutdown")
}
require.NoError(t, monitor.Stop(), "could not stop monitor")
}

// TestDynamicChecks verifies that we actually kick off health check routines
// for observations that are added after starting the monitor.
func TestDynamicChecks(t *testing.T) {
intervalTicker := ticker.NewForce(time.Hour)

mock := newMockCheck(t)

successChan := make(chan struct{})
shutdown := make(chan struct{})

// Don't configure any health checks for this monitor.
// We'd like to verify that we can add checks after startup.
cfg := &Config{
Checks: []*Observation{},
Shutdown: func(string, ...interface{}) {
shutdown <- struct{}{}
},
}
monitor := NewMonitor(cfg)
require.NoError(t, monitor.Start(), "could not start monitor")

// Tick is a helper we will use to tick our interval.
tick := func() {
select {
case intervalTicker.Force <- testTime:
case <-time.After(timeout):
t.Fatal("could not tick timer")
}
}

observation := &Observation{
Check: mock.call,
Interval: intervalTicker,
Attempts: 2,
Backoff: 0,
Timeout: time.Hour,
OnSuccess: func() {
select {
case successChan <- struct{}{}:
case <-time.After(timeout):
t.Fatal("unable to fire onSuccess callback")
}
},
OnFailure: noOpCallback,
}

// Add the check after having started the monitor.
err := monitor.AddCheck(observation)
require.NoError(t, err, "could not add new observation")

// This should initiate the check we dynamically added above.
tick()

// Verify that we can fire the OnSuccess callback.
mock.sendError(errNonNil)
mock.sendError(nil)
select {
case <-successChan:
case <-time.After(timeout):
t.Fatal("expected success callback")
}

// Verify that we correctly shutdown if the added health check fails.
tick()
mock.sendError(errNonNil)
mock.sendError(errNonNil)

// Since we have failed within our allowed number of retries, we now
// expect a call to our shutdown function.
select {
case <-shutdown:
case <-time.After(timeout):
t.Fatal("expected shutdown")
}
require.NoError(t, monitor.Stop(), "could not stop monitor")
}
