Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
License: MIT
Signed-off-by: Adrian Lanzafame <adrianlanzafame92@gmail.com>
  • Loading branch information
lanzafame committed Apr 23, 2019
1 parent 6d59379 commit d5ecd9e
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 20 deletions.
43 changes: 28 additions & 15 deletions monitor/metrics/checker.go
Expand Up @@ -38,19 +38,17 @@ func NewChecker(metrics *Store, threshold float64) *Checker {
}
}

// CheckPeers will trigger alerts all latest metrics from the given peerset
// CheckPeers will trigger alerts based on the latest metrics from the given peerset
// when they have expired and no alert has been sent before.
func (mc *Checker) CheckPeers(peers []peer.ID) error {
for _, peer := range peers {
// shortcut checking all metrics based on heartbeat
// failure detection
if mc.Failed(peer) {
err := mc.alert(peer, "ping")
if err != nil {
return err
}
}
for _, metric := range mc.metrics.PeerMetrics(peer) {
if mc.FailedMetric(metric.Name, peer) {
err := mc.alert(peer, metric.Name)
if err != nil {
return err
}
}
err := mc.alertIfExpired(metric)
if err != nil {
return err
Expand Down Expand Up @@ -133,20 +131,35 @@ func (mc *Checker) Watch(ctx context.Context, peersF func(context.Context) ([]pe
// Peers that are not present in the metrics store will return
// as failed.
func (mc *Checker) Failed(pid peer.ID) bool {
_, _, _, result := mc.failed(pid)
_, _, _, result := mc.failed("ping", pid)
return result
}

// FailedMetric reports whether the given peer is considered failed
// according to the named metric. It generalizes Failed, which only
// looks at the "ping" metric.
func (mc *Checker) FailedMetric(metric string, pid peer.ID) bool {
	// Only the final verdict matters here; the intermediate values
	// returned by failed are discarded.
	_, _, _, isFailed := mc.failed(metric, pid)
	return isFailed
}

// failed returns all the values involved in making the decision
// as to whether a peer has failed or not. This is mainly for
// debugging purposes.
func (mc *Checker) failed(pid peer.ID) (float64, []float64, float64, bool) {
latest := mc.metrics.PeerLatest("ping", pid)
func (mc *Checker) failed(metric string, pid peer.ID) (float64, []float64, float64, bool) {
latest := mc.metrics.PeerLatest(metric, pid)
if latest == nil {
return 0.0, nil, 0.0, true
}
v := time.Now().UnixNano() - latest.ReceivedAt
dv := mc.metrics.Distribution("ping", pid)
phiv := phi(float64(v), dv)
return float64(v), dv, phiv, phiv >= mc.threshold
dv := mc.metrics.Distribution(metric, pid)
// One metric isn't enough to calculate a distribution, so
// alerting/failure detection will fall back to the metric-expiring
// method.
switch {
case len(dv) < 2 && !latest.Expired():
return float64(v), dv, 0.0, false
default:
phiv := phi(float64(v), dv)
return float64(v), dv, phiv, phiv >= mc.threshold
}
}
14 changes: 9 additions & 5 deletions monitor/metrics/checker_test.go
Expand Up @@ -17,12 +17,12 @@ import (
"github.com/ipfs/ipfs-cluster/test"
)

func TestChecker(t *testing.T) {
func TestCheckPeers(t *testing.T) {
metrics := NewStore()
checker := NewChecker(metrics, 2.0)

metr := &api.Metric{
Name: "test",
Name: "ping",
Peer: test.PeerID1,
Value: "1",
Valid: true,
Expand Down Expand Up @@ -112,6 +112,10 @@ func TestChecker_Failed(t *testing.T) {
})
}

//////////////////
// HELPER TESTS //
//////////////////

func TestThresholdValues(t *testing.T) {
t.Log("TestThresholdValues is useful for testing out different threshold values")
t.Log("It doesn't actually perform any 'tests', so it is skipped by default")
Expand All @@ -133,7 +137,7 @@ func TestThresholdValues(t *testing.T) {
output := false

check := func(i int) bool {
inputv, dist, phiv, got := checker.failed(test.PeerID1)
inputv, dist, phiv, got := checker.failed("ping", test.PeerID1)
if output {
fmt.Println(i)
fmt.Printf("phiv: %f\n", phiv)
Expand Down Expand Up @@ -193,7 +197,7 @@ func TestThresholdValues(t *testing.T) {
output := false

check := func(i int) bool {
inputv, dist, phiv, got := checker.failed(test.PeerID1)
inputv, dist, phiv, got := checker.failed("ping", test.PeerID1)
if output {
fmt.Println(i)
fmt.Printf("phiv: %f\n", phiv)
Expand Down Expand Up @@ -254,7 +258,7 @@ func TestThresholdValues(t *testing.T) {
output := false

check := func(i int) bool {
inputv, dist, phiv, got := checker.failed(test.PeerID1)
inputv, dist, phiv, got := checker.failed("ping", test.PeerID1)
if output {
fmt.Println(i)
fmt.Printf("phiv: %f\n", phiv)
Expand Down

0 comments on commit d5ecd9e

Please sign in to comment.