Merge pull request #8504 from calvinrzachman/healthcheck
lnd/healthcheck: add checks after initialization + success/failure callbacks
yyforyongyu committed Mar 13, 2024
2 parents 1fd6bc8 + add2b29 commit d9887f3
Showing 3 changed files with 280 additions and 19 deletions.
3 changes: 3 additions & 0 deletions docs/release-notes/release-notes-0.18.0.md
@@ -205,6 +205,9 @@
for blinded path payloads to allow fuzzing before LND fully supports
blinded payment relay.

* Allow `healthcheck` package users to provide [custom callbacks](https://github.com/lightningnetwork/lnd/pull/8504)
which will execute whenever a healthcheck succeeds/fails.

### Logging
* [Add the htlc amount](https://github.com/lightningnetwork/lnd/pull/8156) to
contract court logs in case of timed-out htlcs in order to easily spot dust
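As a minimal sketch of the feature the release note describes, here is how a package user might wire up the new callbacks, assuming the functional-option API shown in the healthcheck.go diff below. The check name, check function, and durations are illustrative placeholders and are not part of this commit.

```go
package main

import (
	"errors"
	"log"
	"os"
	"time"

	"github.com/lightningnetwork/lnd/healthcheck"
)

func main() {
	// checkDisk is a stand-in health check; a real caller would probe
	// free disk space, a TLS certificate, etc.
	checkDisk := func() error {
		if _, err := os.Stat("/"); err != nil {
			return errors.New("disk check failed")
		}
		return nil
	}

	// Attach the new success/failure callbacks via the functional
	// options added in this commit.
	obs := healthcheck.NewObservation(
		"disk space", checkDisk,
		time.Minute,    // interval between checks
		10*time.Second, // timeout for a single attempt
		time.Second,    // backoff between retries
		3,              // attempts before the check is considered failed
		healthcheck.WithSuccessCallback(func() {
			log.Println("disk check passed")
		}),
		healthcheck.WithFailureCallback(func() {
			log.Println("disk check failed after all attempts")
		}),
	)

	monitor := healthcheck.NewMonitor(&healthcheck.Config{
		Checks: []*healthcheck.Observation{obs},
		Shutdown: func(format string, params ...interface{}) {
			log.Printf("shutting down: "+format, params...)
		},
	})
	if err := monitor.Start(); err != nil {
		log.Fatalf("could not start monitor: %v", err)
	}
	defer func() { _ = monitor.Stop() }()

	// Block long enough for a few check intervals to fire.
	time.Sleep(5 * time.Minute)
}
```

If neither option is supplied, the callbacks default to no-ops, as the NewObservation changes below show.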
90 changes: 80 additions & 10 deletions healthcheck/healthcheck.go
@@ -15,6 +15,8 @@ import (
"github.com/lightningnetwork/lnd/ticker"
)

var noOpCallback = func() {}

// Config contains configuration settings for our monitor.
type Config struct {
// Checks is a set of health checks that assert that lnd has access to
@@ -96,6 +98,19 @@ func (m *Monitor) Stop() error {
return nil
}

// AddCheck adds a new healthcheck to our monitor.
func (m *Monitor) AddCheck(check *Observation) error {

m.wg.Add(1)
go func(check *Observation) {
defer m.wg.Done()

check.monitor(m.cfg.Shutdown, m.quit)
}(check)

return nil
}

// CreateCheck is a helper function that takes a function that produces an error
// and wraps it in a function that returns its result on an error channel.
// We do not wait group the goroutine running our checkFunc because we expect
@@ -137,20 +152,65 @@ type Observation struct {
// Backoff is the amount of time we back off between retries for failed
// checks.
Backoff time.Duration

// OnSuccess is a callback which will be executed when the healthcheck
// succeeds. This is optional.
OnSuccess func()

// OnFailure is a callback which will be executed when the healthcheck
// fails. This is optional.
OnFailure func()
}

// ObservationOption describes the signature of a functional option that can be
// used to modify the behaviour of an Observation.
type ObservationOption func(*Observation)

// WithSuccessCallback configures an observation with a callback to be fired
// whenever the health check succeeds.
func WithSuccessCallback(callback func()) ObservationOption {
return func(o *Observation) {
o.OnSuccess = callback
}
}

// WithFailureCallback configures an observation with a callback to be fired
// whenever the health check reaches its failure threshold.
func WithFailureCallback(callback func()) ObservationOption {
return func(o *Observation) {
o.OnFailure = callback
}
}

// NewObservation creates an observation.
func NewObservation(name string, check func() error, interval,
timeout, backoff time.Duration, attempts int) *Observation {
func NewObservation(name string, check func() error, interval, timeout,
backoff time.Duration, attempts int,
opts ...ObservationOption) *Observation {

return &Observation{
observation := &Observation{
Name: name,
Check: CreateCheck(check),
Interval: ticker.New(interval),
Attempts: attempts,
Timeout: timeout,
Backoff: backoff,
}

// Apply each option to the observation.
for _, opt := range opts {
opt(observation)
}

// Ensure that we default to NO-OP callbacks.
if observation.OnSuccess == nil {
observation.OnSuccess = noOpCallback
}

if observation.OnFailure == nil {
observation.OnFailure = noOpCallback
}

return observation
}

// String returns a string representation of an observation.
@@ -206,6 +266,17 @@ func (o *Observation) retryCheck(quit chan struct{},
var err error
select {
case err = <-o.Check():
// If our error is nil, we have passed our health check,
// so we'll invoke our success callback if defined and
// then exit.
if err == nil {
log.Debug("invoking success callback")

// Invoke the success callback.
o.OnSuccess()

return false
}

case <-time.After(o.Timeout):
err = fmt.Errorf("health check: %v timed out after: "+
@@ -216,15 +287,14 @@
return false
}

// If our error is nil, we have passed our health check, so we
// can exit.
if err == nil {
return false
}

// If we have reached our allowed number of attempts, this
// check has failed so we request shutdown.
// check has failed so we'll fire the on failure callback
// and request shutdown.
if count == o.Attempts {
log.Debug("invoking failure callback")

o.OnFailure()

shutdown("Health check: %v failed after %v "+
"calls", o, o.Attempts)
return true
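For context on the second feature in this commit, here is a minimal sketch of registering a check after the monitor has already started, using the AddCheck method added in the diff above. The observation name, check function, and durations are placeholders and not part of this commit.

```go
package main

import (
	"log"
	"time"

	"github.com/lightningnetwork/lnd/healthcheck"
)

func main() {
	// Start the monitor with no checks configured up front.
	monitor := healthcheck.NewMonitor(&healthcheck.Config{
		Checks: []*healthcheck.Observation{},
		Shutdown: func(format string, params ...interface{}) {
			log.Printf("shutting down: "+format, params...)
		},
	})
	if err := monitor.Start(); err != nil {
		log.Fatalf("could not start monitor: %v", err)
	}
	defer func() { _ = monitor.Stop() }()

	// Later, once the dependency we want to watch becomes available,
	// register a check dynamically. The check function here always
	// succeeds and is a placeholder only.
	obs := healthcheck.NewObservation(
		"tor connection",
		func() error { return nil },
		time.Minute,    // interval between checks
		10*time.Second, // timeout for a single attempt
		time.Second,    // backoff between retries
		2,              // attempts before the check is considered failed
		healthcheck.WithFailureCallback(func() {
			log.Println("tor connection check exhausted its attempts")
		}),
	)

	// AddCheck starts the monitoring goroutine for the new observation
	// immediately; no monitor restart is needed.
	if err := monitor.AddCheck(obs); err != nil {
		log.Printf("unable to add health check: %v", err)
	}

	// Block long enough for a few check intervals to fire.
	time.Sleep(5 * time.Minute)
}
```

The tests below exercise both paths: callbacks firing on success and at the failure threshold, and a check added dynamically after Monitor.Start.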
206 changes: 197 additions & 9 deletions healthcheck/healthcheck_test.go
@@ -55,11 +55,13 @@ func TestMonitor(t *testing.T) {
cfg := &Config{
Checks: []*Observation{
{
Check: mock.call,
Interval: intervalTicker,
Attempts: 2,
Backoff: 0,
Timeout: time.Hour,
Check: mock.call,
Interval: intervalTicker,
Attempts: 2,
Backoff: 0,
Timeout: time.Hour,
OnSuccess: noOpCallback,
OnFailure: noOpCallback,
},
},
Shutdown: func(string, ...interface{}) {
@@ -202,10 +204,12 @@ func TestRetryCheck(t *testing.T) {
// function. We set a zero back off so that the test
// will not wait.
observation := &Observation{
Check: mock.call,
Attempts: test.attempts,
Timeout: test.timeout,
Backoff: 0,
Check: mock.call,
Attempts: test.attempts,
Timeout: test.timeout,
Backoff: 0,
OnSuccess: noOpCallback,
OnFailure: noOpCallback,
}
quit := make(chan struct{})

@@ -238,3 +242,187 @@
})
}
}

// TestCallbacks verifies that we fire the OnSuccess/OnFailure callbacks
// as expected.
//
// - When the health check succeeds, the OnSuccess callback should fire.
// - When the failure threshold is reached, the OnFailure callback should fire.
func TestCallbacks(t *testing.T) {
intervalTicker := ticker.NewForce(time.Hour)

mock := newMockCheck(t)
failureThreshold := 3

successChan := make(chan struct{})
failChan := make(chan struct{})
shutdown := make(chan struct{})

// Create our config for monitoring. We will use a 0 back off so that
// our test does not need to wait.
observation := &Observation{
Check: mock.call,
Interval: intervalTicker,
Attempts: failureThreshold,
Backoff: 0,
Timeout: time.Hour,
OnSuccess: func() {
select {
case successChan <- struct{}{}:
case <-time.After(timeout):
t.Fatal("unable to fire onSuccess callback")
}
},
OnFailure: func() {
close(failChan)
},
}

cfg := &Config{
Checks: []*Observation{observation},
Shutdown: func(string, ...interface{}) {
shutdown <- struct{}{}
},
}
monitor := NewMonitor(cfg)
require.NoError(t, monitor.Start(), "could not start monitor")

// Tick is a helper we will use to tick our interval.
tick := func() {
select {
case intervalTicker.Force <- testTime:
case <-time.After(timeout):
t.Fatal("could not tick timer")
}
}

// We expect that the onSuccess callback is fired after each successful
// check.
for i := 0; i < failureThreshold; i++ {
tick()
mock.sendError(nil)

// We expect that the onSuccess callback will have fired.
select {
case <-successChan:
case <-time.After(timeout):
t.Fatal("expected success callback")
}

}

// Kick off another health check iteration. The monitor's internal
// retry mechanism will re-attempt the check until it has reached
// the configured maximum # of attempts.
//
// This mocks our check function failing the maximum # of times
// consecutively that it is allowed.
tick()
for i := 1; i <= failureThreshold; i++ {
mock.sendError(errNonNil)

// Verify that the onFailure callback does not fire unless
// the failure threshold (maximum # of attempts) is reached.
if i < failureThreshold {
select {
case <-failChan:
t.Fatal("unexpected onFailure callback")
default:
}
}
}

// After reaching the failure threshold for this health check,
// we expect that the onFailure callback will have fired.
select {
case <-failChan:
case <-time.After(timeout):
t.Fatal("expected onFailure callback")
}

// Since we have failed within our allowed number of retries, we now
// expect a call to our shutdown function.
select {
case <-shutdown:
case <-time.After(timeout):
t.Fatal("expected shutdown")
}
require.NoError(t, monitor.Stop(), "could not stop monitor")
}

// TestDynamicChecks verifies that we actually kick off health check routines
// for observations that are added after starting the monitor.
func TestDynamicChecks(t *testing.T) {
intervalTicker := ticker.NewForce(time.Hour)

mock := newMockCheck(t)

successChan := make(chan struct{})
shutdown := make(chan struct{})

// Don't configure any health checks for this monitor.
// We'd like to verify that we can add checks after startup.
cfg := &Config{
Checks: []*Observation{},
Shutdown: func(string, ...interface{}) {
shutdown <- struct{}{}
},
}
monitor := NewMonitor(cfg)
require.NoError(t, monitor.Start(), "could not start monitor")

// Tick is a helper we will use to tick our interval.
tick := func() {
select {
case intervalTicker.Force <- testTime:
case <-time.After(timeout):
t.Fatal("could not tick timer")
}
}

observation := &Observation{
Check: mock.call,
Interval: intervalTicker,
Attempts: 2,
Backoff: 0,
Timeout: time.Hour,
OnSuccess: func() {
select {
case successChan <- struct{}{}:
case <-time.After(timeout):
t.Fatal("unable to fire onSuccess callback")
}
},
OnFailure: noOpCallback,
}

// Add the check after having started the monitor.
err := monitor.AddCheck(observation)
require.NoError(t, err, "could not add new observation")

// This should initiate the check we dynamically added above.
tick()

// Verify that we can fire the OnSuccess callback.
mock.sendError(errNonNil)
mock.sendError(nil)
select {
case <-successChan:
case <-time.After(timeout):
t.Fatal("expected success callback")
}

// Verify that we correctly shutdown if the added health check fails.
tick()
mock.sendError(errNonNil)
mock.sendError(errNonNil)

// Since we have failed within our allowed number of retries, we now
// expect a call to our shutdown function.
select {
case <-shutdown:
case <-time.After(timeout):
t.Fatal("expected shutdown")
}
require.NoError(t, monitor.Stop(), "could not stop monitor")
}
