Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

autopilot: add Enterprise health information to API endpoint #20153

Merged
merged 2 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/20153.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
autopilot: add Enterprise health information to autopilot API
```
80 changes: 80 additions & 0 deletions api/operator_autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,86 @@ type OperatorHealthReply struct {

// Servers holds the health of each server.
Servers []ServerHealth

// The ID of the current leader.
Leader string

// List of servers that are voters in the Raft configuration.
Voters []string

// ReadReplicas holds the list of servers that are
// read replicas in the Raft configuration. (Enterprise only)
ReadReplicas []string `json:",omitempty"`

// RedundancyZones holds the list of servers in each redundancy zone.
// (Enterprise only)
RedundancyZones map[string]AutopilotZone `json:",omitempty"`

// Upgrade holds the current upgrade status.
Upgrade *AutopilotUpgrade `json:",omitempty"`

// The number of servers that could be lost without an outage
// occurring if all the voters don't fail at once. (Enterprise only)
OptimisticFailureTolerance int `json:",omitempty"`
}

// AutopilotZone describes a single redundancy zone: the servers it contains,
// which of those servers are voters, and the zone's failure tolerance.
// (Enterprise only)
type AutopilotZone struct {
// Servers holds the list of servers in the redundancy zone.
// NOTE(review): entries are presumably server IDs, as with
// OperatorHealthReply.Voters — confirm against the Enterprise producer.
Servers []string

// Voters holds the list of servers that are voters in the redundancy zone.
Voters []string

// FailureTolerance is the number of servers that could be lost without an
// outage occurring.
FailureTolerance int
}

// AutopilotUpgrade holds the current upgrade status. Servers are grouped by
// whether they run the TargetVersion or some other version, and by their role
// (voter, non-voter, read replica) in the Raft configuration. (Enterprise only)
type AutopilotUpgrade struct {
// Status is the status of the upgrade.
Status string

// TargetVersion is the version that the cluster is upgrading to.
TargetVersion string

// TargetVersionVoters holds the list of servers that are voters in the Raft
// configuration of the TargetVersion.
TargetVersionVoters []string

// TargetVersionNonVoters holds the list of servers that are non-voters in
// the Raft configuration of the TargetVersion.
TargetVersionNonVoters []string

// TargetVersionReadReplicas holds the list of servers that are read
// replicas in the Raft configuration of the TargetVersion.
TargetVersionReadReplicas []string

// OtherVersionVoters holds the list of servers that are voters in the Raft
// configuration of a version other than the TargetVersion.
OtherVersionVoters []string

// OtherVersionNonVoters holds the list of servers that are non-voters in
// the Raft configuration of a version other than the TargetVersion.
OtherVersionNonVoters []string

// OtherVersionReadReplicas holds the list of servers that are read replicas
// in the Raft configuration of a version other than the TargetVersion.
OtherVersionReadReplicas []string

// RedundancyZones holds, for each redundancy zone, the voters and
// non-voters in that zone for both the TargetVersion and other versions.
RedundancyZones map[string]AutopilotZoneUpgradeVersions
}

// AutopilotZoneUpgradeVersions holds the list of servers in a redundancy zone,
// grouped by version (TargetVersion or other) and by voter status.
// (Enterprise only)
type AutopilotZoneUpgradeVersions struct {
// TargetVersionVoters holds the voters in this zone for the TargetVersion.
TargetVersionVoters []string
// TargetVersionNonVoters holds the non-voters in this zone for the
// TargetVersion.
TargetVersionNonVoters []string
// OtherVersionVoters holds the voters in this zone for a version other than
// the TargetVersion.
OtherVersionVoters []string
// OtherVersionNonVoters holds the non-voters in this zone for a version
// other than the TargetVersion.
OtherVersionNonVoters []string
}

// AutopilotGetConfiguration is used to query the current Autopilot configuration.
Expand Down
18 changes: 18 additions & 0 deletions api/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,21 @@ func TestOperator_SchedulerSetConfiguration(t *testing.T) {
must.True(t, schedulerConfig.SchedulerConfig.MemoryOversubscriptionEnabled)
must.Eq(t, schedulerConfig.SchedulerConfig.PreemptionConfig, newSchedulerConfig.PreemptionConfig)
}

// TestOperator_AutopilotState exercises the autopilot server health API call
// twice: once with the ACL client's secret ID in place, which must succeed,
// and once after the secret ID is cleared, which must fail with a 403.
func TestOperator_AutopilotState(t *testing.T) {
	testutil.Parallel(t)

	client, srv, _ := makeACLClient(t, nil, nil)
	defer srv.Stop()

	op := client.Operator()

	// Authenticated request: expected to succeed.
	_, _, authErr := op.AutopilotServerHealth(nil)
	must.NoError(t, authErr)

	// Unauthenticated request: clear the secret ID and expect a 403.
	client.SetSecretID("")
	_, _, anonErr := op.AutopilotServerHealth(nil)
	must.ErrorContains(t, anonErr, "403")
}
8 changes: 7 additions & 1 deletion command/agent/operator_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Re
out := &api.OperatorHealthReply{
Healthy: reply.Healthy,
FailureTolerance: reply.FailureTolerance,
Voters: reply.Voters,
Leader: reply.Leader,
}
for _, server := range reply.Servers {
out.Servers = append(out.Servers, api.ServerHealth{
Expand All @@ -269,6 +271,9 @@ func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Re
})
}

// Modify the reply to include Enterprise response
autopilotToAPIEntState(reply, out)

return out, nil
}

Expand Down Expand Up @@ -321,7 +326,8 @@ func (s *HTTPServer) schedulerUpdateConfig(resp http.ResponseWriter, req *http.R
SystemSchedulerEnabled: conf.PreemptionConfig.SystemSchedulerEnabled,
SysBatchSchedulerEnabled: conf.PreemptionConfig.SysBatchSchedulerEnabled,
BatchSchedulerEnabled: conf.PreemptionConfig.BatchSchedulerEnabled,
ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled},
ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled,
},
}

if err := args.Config.Validate(); err != nil {
Expand Down
6 changes: 6 additions & 0 deletions command/agent/operator_endpoint_ce.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ package agent

import (
"net/http"

"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/nomad/structs"
)

func (s *HTTPServer) LicenseRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
Expand All @@ -20,5 +23,8 @@ func (s *HTTPServer) LicenseRequest(resp http.ResponseWriter, req *http.Request)
default:
return nil, CodedError(405, ErrInvalidMethod)
}
}

// autopilotToAPIEntState is the hook OperatorServerHealth calls to add
// Enterprise autopilot data to the API reply. This implementation is a no-op:
// it ignores both the RPC reply and the API reply and returns nil.
// NOTE(review): presumably the Enterprise build supplies a counterpart that
// fills in the Enterprise-only fields of api.OperatorHealthReply — confirm.
func autopilotToAPIEntState(_ structs.OperatorHealthReply, _ *api.OperatorHealthReply) interface{} {
return nil
}
56 changes: 56 additions & 0 deletions command/agent/operator_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,62 @@ func TestOperator_ServerHealth_Unhealthy(t *testing.T) {
})
}

// TestOperator_AutopilotHealth starts a single test agent and polls the
// autopilot health endpoint until it reports the local server as healthy and
// as a Raft voter.
func TestOperator_AutopilotHealth(t *testing.T) {
	ci.Parallel(t)

	httpTest(t, func(c *Config) {
		c.Server.RaftProtocol = 3
	}, func(s *TestAgent) {
		body := bytes.NewBuffer(nil)
		req, err := http.NewRequest(http.MethodGet, "/v1/operator/autopilot/health", body)
		must.NoError(t, err)

		// check is retried until it first succeeds, so every problem must be
		// reported by returning an error. Calling t.Errorf or panicking here
		// would fail the test even if a later attempt would have passed.
		check := func() error {
			resp := httptest.NewRecorder()
			obj, err := s.Server.OperatorServerHealth(resp, req)
			if err != nil {
				return fmt.Errorf("failed to get operator server state: %w", err)
			}
			if code := resp.Code; code != http.StatusOK {
				return fmt.Errorf("response code not 200, got: %d", code)
			}
			out, ok := obj.(*api.OperatorHealthReply)
			if !ok {
				return fmt.Errorf("unexpected response type: %T", obj)
			}
			if n := len(out.Servers); n != 1 {
				return fmt.Errorf("expected 1 server, got: %d", n)
			}

			serfMember := s.server.LocalMember()
			t.Log("serfMember", serfMember)
			id, ok := serfMember.Tags["id"]
			if !ok {
				return fmt.Errorf("serf member has no \"id\" tag")
			}

			// Locate the local server's entry in the health reply.
			var (
				local api.ServerHealth
				found bool
			)
			for _, srv := range out.Servers {
				if srv.ID == id {
					local, found = srv, true
					break
				}
			}
			if !found {
				return fmt.Errorf("server %s not found in health reply", id)
			}
			if !local.Healthy {
				return fmt.Errorf("expected autopilot server status to be healthy, got: %t", local.Healthy)
			}

			// Guard the index: the voter list may still be empty on early
			// attempts, and indexing it would panic instead of retrying.
			if len(out.Voters) == 0 {
				return fmt.Errorf("expected server %s to be a voter, got no voters", id)
			}
			if voter := out.Voters[0]; voter != id {
				return fmt.Errorf("expected server to be voter: %s", voter)
			}
			return nil
		}

		must.Wait(t, wait.InitialSuccess(
			wait.ErrorFunc(check),
			wait.Timeout(10*time.Second),
			wait.Gap(1*time.Second),
		))
	})
}

func TestOperator_SchedulerGetConfiguration(t *testing.T) {
ci.Parallel(t)
httpTest(t, nil, func(s *TestAgent) {
Expand Down
66 changes: 42 additions & 24 deletions nomad/autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"strconv"

metrics "github.com/armon/go-metrics"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
autopilot "github.com/hashicorp/raft-autopilot"
Expand Down Expand Up @@ -116,43 +117,60 @@ func (s *Server) GetClusterHealth() *structs.OperatorHealthReply {
health := &structs.OperatorHealthReply{
Healthy: state.Healthy,
FailureTolerance: state.FailureTolerance,
Leader: string(state.Leader),
Voters: stringIDs(state.Voters),
Servers: make([]structs.ServerHealth, 0, len(state.Servers)),
}

for _, srv := range state.Servers {
srvHealth := structs.ServerHealth{
ID: string(srv.Server.ID),
Name: srv.Server.Name,
Address: string(srv.Server.Address),
Version: srv.Server.Version,
Leader: srv.State == autopilot.RaftLeader,
Voter: srv.State == autopilot.RaftLeader || srv.State == autopilot.RaftVoter,
LastContact: srv.Stats.LastContact,
LastTerm: srv.Stats.LastTerm,
LastIndex: srv.Stats.LastIndex,
Healthy: srv.Health.Healthy,
StableSince: srv.Health.StableSince,
}

switch srv.Server.NodeStatus {
case autopilot.NodeAlive:
srvHealth.SerfStatus = serf.StatusAlive
case autopilot.NodeLeft:
srvHealth.SerfStatus = serf.StatusLeft
case autopilot.NodeFailed:
srvHealth.SerfStatus = serf.StatusFailed
default:
srvHealth.SerfStatus = serf.StatusNone
}
srvHealth := autopilotToServerHealth(srv)

health.Servers = append(health.Servers, srvHealth)
}
err := s.autopilotStateExt(state, health)
if err != nil {
s.logger.Error("Error parsing autopilot state", "error", err)
}

return health
}

// -------------------
// helper functions

// autopilotToServerHealth builds a structs.ServerHealth from a single
// autopilot server state. A server counts as a voter when its Raft state is
// either leader or voter. The autopilot node status is translated to the
// matching serf status; any status other than alive, left or failed becomes
// serf.StatusNone.
func autopilotToServerHealth(srv *autopilot.ServerState) structs.ServerHealth {
	// Start from the fallback and override it for the known node statuses.
	serfStatus := serf.StatusNone
	switch srv.Server.NodeStatus {
	case autopilot.NodeAlive:
		serfStatus = serf.StatusAlive
	case autopilot.NodeLeft:
		serfStatus = serf.StatusLeft
	case autopilot.NodeFailed:
		serfStatus = serf.StatusFailed
	}

	isLeader := srv.State == autopilot.RaftLeader
	isVoter := isLeader || srv.State == autopilot.RaftVoter

	return structs.ServerHealth{
		ID:          string(srv.Server.ID),
		Name:        srv.Server.Name,
		Address:     string(srv.Server.Address),
		Version:     srv.Server.Version,
		Leader:      isLeader,
		Voter:       isVoter,
		LastContact: srv.Stats.LastContact,
		LastTerm:    srv.Stats.LastTerm,
		LastIndex:   srv.Stats.LastIndex,
		Healthy:     srv.Health.Healthy,
		StableSince: srv.Health.StableSince,
		SerfStatus:  serfStatus,
	}
}

// stringIDs converts a slice of Raft server IDs into a slice of plain strings,
// one output element per input ID.
func stringIDs(ids []raft.ServerID) []string {
	toString := func(id raft.ServerID) string {
		return string(id)
	}
	return helper.ConvertSlice(ids, toString)
}

func minRaftProtocol(members []serf.Member, serverFunc func(serf.Member) (bool, *serverParts)) (int, error) {
minVersion := -1
for _, m := range members {
Expand Down
4 changes: 4 additions & 0 deletions nomad/autopilot_ce.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ func (s *Server) autopilotServerExt(_ *serverParts) interface{} {
return nil
}

// autopilotStateExt is the hook GetClusterHealth calls after the per-server
// health has been built, passing the autopilot state and the health reply.
// This implementation is a no-op: it ignores both arguments and returns nil.
// NOTE(review): presumably the Enterprise build supplies a counterpart that
// adds Enterprise state to the reply — confirm.
func (s *Server) autopilotStateExt(_ *autopilot.State, _ *structs.OperatorHealthReply) error {
return nil
}

// autopilotConfigExt returns the autopilot-enterprise.Config extensions needed
// for ENT feature support, but this is the empty OSS implementation.
func autopilotConfigExt(_ *structs.AutopilotConfig) interface{} {
Expand Down
38 changes: 38 additions & 0 deletions nomad/autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/shoenig/test/must"

"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
)

Expand Down Expand Up @@ -306,5 +307,42 @@ func TestAutopilot_PromoteNonVoter(t *testing.T) {
}
return true, nil
}, func(err error) { must.NoError(t, err) })
}

// TestAutopilot_ReturnAutopilotHealth joins two servers, waits for stable
// leadership, and then verifies that the Operator.ServerHealth RPC reports the
// cluster as healthy and names the current Raft leader.
func TestAutopilot_ReturnAutopilotHealth(t *testing.T) {
	ci.Parallel(t)

	// Both servers are started with the same configuration.
	configure := func(c *Config) {
		c.BootstrapExpect = 2
		c.RaftConfig.ProtocolVersion = 3
		c.AutopilotConfig.EnableCustomUpgrades = true
		c.UpgradeVersion = "0.0.1"
		c.NumSchedulers = 0 // reduce log noise
	}

	s1, cleanupS1 := TestServer(t, configure)
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, configure)
	defer cleanupS2()

	TestJoin(t, s1, s2)
	leader := waitForStableLeadership(t, []*Server{s1, s2})

	get := &structs.GenericRequest{
		QueryOptions: structs.QueryOptions{
			Region: "global",
		},
	}
	reply := &structs.OperatorHealthReply{}
	must.NoError(t, s1.RPC("Operator.ServerHealth", get, reply))

	must.True(t, reply.Healthy)

	// must.Eq takes the expected value first, so the Raft leader ID goes
	// before the value reported in the reply.
	_, leaderID := leader.raft.LeaderWithID()
	must.Eq(t, string(leaderID), reply.Leader)
}