Skip to content

Commit

Permalink
autopilot: add Enterprise health information to API endpoint
Browse files Browse the repository at this point in the history
Add information about autopilot health to the `/operator/autopilot/health` API
in Nomad Enterprise.

I've pulled the CE changes required for this feature out of @lindleywhite's PR
in the Enterprise repo. A separate PR will include a new `operator autopilot
health` command that can present this information at the command line.

Ref: hashicorp/nomad-enterprise#1394
  • Loading branch information
lindleywhite authored and tgross committed Mar 18, 2024
1 parent 1cbddfa commit cb8180d
Show file tree
Hide file tree
Showing 10 changed files with 359 additions and 26 deletions.
66 changes: 66 additions & 0 deletions api/operator_autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,72 @@ type OperatorHealthReply struct {

// Servers holds the health of each server.
Servers []ServerHealth

// The ID of the current leader.
Leader string

// List of servers that are voters in the Raft configuration.
Voters []string

// ReadReplicas holds the list of servers that are
// read replicas in the Raft configuration. (Enterprise only)
ReadReplicas []string `json:",omitempty"`
// RedundancyZones holds the list of servers in each redundancy zone. (Enterprise only)
RedundancyZones map[string]AutopilotZone `json:",omitempty"`

// Upgrade holds the current upgrade status.
Upgrade *AutopilotUpgrade `json:",omitempty"`

// The number of servers that could be lost without an outage
// occurring if all the voters don't fail at once. (Enterprise only)
OptimisticFailureTolerance int `json:",omitempty"`
}

// AutopilotZone describes the membership of a single redundancy zone.
// (Enterprise only)
type AutopilotZone struct {
	// Servers lists every server that belongs to the redundancy zone.
	Servers []string

	// Voters lists the servers in the redundancy zone that are voters.
	Voters []string

	// FailureTolerance is how many servers could be lost without an outage
	// occurring.
	FailureTolerance int
}

// AutopilotUpgrade reports the state of an in-progress upgrade, splitting the
// servers by whether or not they run the version being upgraded to.
// (Enterprise only)
type AutopilotUpgrade struct {
	// Status is the status of the upgrade.
	Status string

	// TargetVersion is the version the cluster is upgrading to.
	TargetVersion string

	// TargetVersionVoters lists the servers on the TargetVersion that are
	// voters in the Raft configuration.
	TargetVersionVoters []string

	// TargetVersionNonVoters lists the servers on the TargetVersion that are
	// non-voters in the Raft configuration.
	TargetVersionNonVoters []string

	// TargetVersionReadReplicas lists the servers on the TargetVersion that
	// are read replicas in the Raft configuration.
	TargetVersionReadReplicas []string

	// OtherVersionVoters lists the servers on any version other than the
	// TargetVersion that are voters in the Raft configuration.
	OtherVersionVoters []string

	// OtherVersionNonVoters lists the servers on any version other than the
	// TargetVersion that are non-voters in the Raft configuration.
	OtherVersionNonVoters []string

	// OtherVersionReadReplicas lists the servers on any version other than
	// the TargetVersion that are read replicas in the Raft configuration.
	OtherVersionReadReplicas []string

	// RedundancyZones holds, for each redundancy zone, the servers in that
	// zone broken down by version.
	RedundancyZones map[string]AutopilotZoneUpgradeVersions
}

// AutopilotZoneUpgradeVersions is the per-zone entry of
// AutopilotUpgrade.RedundancyZones: the servers of one redundancy zone,
// grouped by version and voting status. (Enterprise only)
type AutopilotZoneUpgradeVersions struct {
	// TargetVersionVoters lists the voters in the zone on the target version.
	TargetVersionVoters []string

	// TargetVersionNonVoters lists the non-voters in the zone on the target
	// version.
	TargetVersionNonVoters []string

	// OtherVersionVoters lists the voters in the zone on any other version.
	OtherVersionVoters []string

	// OtherVersionNonVoters lists the non-voters in the zone on any other
	// version.
	OtherVersionNonVoters []string
}

// AutopilotGetConfiguration is used to query the current Autopilot configuration.
Expand Down
18 changes: 18 additions & 0 deletions api/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,21 @@ func TestOperator_SchedulerSetConfiguration(t *testing.T) {
must.True(t, schedulerConfig.SchedulerConfig.MemoryOversubscriptionEnabled)
must.Eq(t, schedulerConfig.SchedulerConfig.PreemptionConfig, newSchedulerConfig.PreemptionConfig)
}

// TestOperator_AutopilotState verifies that the autopilot server health call
// succeeds for an authenticated client and is rejected with a 403 once the
// client's secret ID has been cleared.
func TestOperator_AutopilotState(t *testing.T) {
	testutil.Parallel(t)

	client, server, _ := makeACLClient(t, nil, nil)
	defer server.Stop()

	op := client.Operator()

	// Authenticated: the client is used exactly as makeACLClient returned it.
	_, _, authedErr := op.AutopilotServerHealth(nil)
	must.NoError(t, authedErr)

	// Unauthenticated: drop the secret ID and expect the request to be denied.
	client.SetSecretID("")
	_, _, anonErr := op.AutopilotServerHealth(nil)
	must.ErrorContains(t, anonErr, "403")
}
8 changes: 7 additions & 1 deletion command/agent/operator_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Re
out := &api.OperatorHealthReply{
Healthy: reply.Healthy,
FailureTolerance: reply.FailureTolerance,
Voters: reply.Voters,
Leader: reply.Leader,
}
for _, server := range reply.Servers {
out.Servers = append(out.Servers, api.ServerHealth{
Expand All @@ -269,6 +271,9 @@ func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Re
})
}

// Modify the reply to include Enterprise response
autopilotToAPIEntState(reply, out)

return out, nil
}

Expand Down Expand Up @@ -321,7 +326,8 @@ func (s *HTTPServer) schedulerUpdateConfig(resp http.ResponseWriter, req *http.R
SystemSchedulerEnabled: conf.PreemptionConfig.SystemSchedulerEnabled,
SysBatchSchedulerEnabled: conf.PreemptionConfig.SysBatchSchedulerEnabled,
BatchSchedulerEnabled: conf.PreemptionConfig.BatchSchedulerEnabled,
ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled},
ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled,
},
}

if err := args.Config.Validate(); err != nil {
Expand Down
6 changes: 6 additions & 0 deletions command/agent/operator_endpoint_ce.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ package agent

import (
"net/http"

"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/nomad/structs"
)

func (s *HTTPServer) LicenseRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
Expand All @@ -20,5 +23,8 @@ func (s *HTTPServer) LicenseRequest(resp http.ResponseWriter, req *http.Request)
default:
return nil, CodedError(405, ErrInvalidMethod)
}
}

// autopilotToAPIEntState is the hook OperatorServerHealth invokes to add the
// Enterprise portion of the autopilot health response to the API reply. This
// implementation has nothing to add: it ignores both arguments and returns
// nil.
func autopilotToAPIEntState(_ structs.OperatorHealthReply, _ *api.OperatorHealthReply) interface{} {
	return nil
}
56 changes: 56 additions & 0 deletions command/agent/operator_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,62 @@ func TestOperator_ServerHealth_Unhealthy(t *testing.T) {
})
}

// TestOperator_AutopilotHealth checks that the autopilot health endpoint of a
// single-server agent eventually reports that server as present, healthy, and
// the first voter.
func TestOperator_AutopilotHealth(t *testing.T) {
	ci.Parallel(t)

	httpTest(t, func(c *Config) {
		c.Server.RaftProtocol = 3
	}, func(s *TestAgent) {
		body := bytes.NewBuffer(nil)
		req, err := http.NewRequest(http.MethodGet, "/v1/operator/autopilot/health", body)
		must.NoError(t, err)

		// f is polled until it returns nil. Every unmet expectation is
		// reported by returning an error so the attempt can be retried;
		// calling t.Errorf or panicking in here would fail the test on the
		// first attempt even if a later attempt would have succeeded.
		f := func() error {
			resp := httptest.NewRecorder()
			obj, err := s.Server.OperatorServerHealth(resp, req)
			if err != nil {
				return fmt.Errorf("failed to get operator server state: %w", err)
			}
			if code := resp.Code; code != 200 {
				return fmt.Errorf("response code not 200, got: %d", code)
			}
			out, ok := obj.(*api.OperatorHealthReply)
			if !ok {
				return fmt.Errorf("unexpected response type: %T", obj)
			}
			if n := len(out.Servers); n != 1 {
				return fmt.Errorf("expected 1 server, got: %d", n)
			}

			serfMember := s.server.LocalMember()
			id, ok := serfMember.Tags["id"]
			if !ok {
				return fmt.Errorf("Tag not found")
			}

			// Find the health entry for the local server; with one server in
			// the cluster it is also the leader.
			var leader api.ServerHealth
			for _, srv := range out.Servers {
				if srv.ID == id {
					leader = srv
					break
				}
			}

			t.Log("serfMember", serfMember)
			if leader.ID != id {
				return fmt.Errorf("expected server names to match, got %s and %s", leader.ID, id)
			}
			if !leader.Healthy {
				return fmt.Errorf("expected autopilot server status to be healthy, got: %t", leader.Healthy)
			}

			// Guard the index: an empty voter list must be a retryable
			// failure, not an out-of-range panic.
			if len(out.Voters) == 0 {
				return fmt.Errorf("expected at least one voter, got none")
			}
			if voter := out.Voters[0]; voter != id {
				return fmt.Errorf("expected server to be voter: %s", voter)
			}
			return nil
		}

		must.Wait(t, wait.InitialSuccess(
			wait.ErrorFunc(f),
			wait.Timeout(10*time.Second),
			wait.Gap(1*time.Second),
		))
	})
}

func TestOperator_SchedulerGetConfiguration(t *testing.T) {
ci.Parallel(t)
httpTest(t, nil, func(s *TestAgent) {
Expand Down
69 changes: 45 additions & 24 deletions nomad/autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,43 +116,64 @@ func (s *Server) GetClusterHealth() *structs.OperatorHealthReply {
health := &structs.OperatorHealthReply{
Healthy: state.Healthy,
FailureTolerance: state.FailureTolerance,
Leader: string(state.Leader),
Voters: stringIDs(state.Voters),
Servers: make([]structs.ServerHealth, 0, len(state.Servers)),
}

for _, srv := range state.Servers {
srvHealth := structs.ServerHealth{
ID: string(srv.Server.ID),
Name: srv.Server.Name,
Address: string(srv.Server.Address),
Version: srv.Server.Version,
Leader: srv.State == autopilot.RaftLeader,
Voter: srv.State == autopilot.RaftLeader || srv.State == autopilot.RaftVoter,
LastContact: srv.Stats.LastContact,
LastTerm: srv.Stats.LastTerm,
LastIndex: srv.Stats.LastIndex,
Healthy: srv.Health.Healthy,
StableSince: srv.Health.StableSince,
}

switch srv.Server.NodeStatus {
case autopilot.NodeAlive:
srvHealth.SerfStatus = serf.StatusAlive
case autopilot.NodeLeft:
srvHealth.SerfStatus = serf.StatusLeft
case autopilot.NodeFailed:
srvHealth.SerfStatus = serf.StatusFailed
default:
srvHealth.SerfStatus = serf.StatusNone
}
srvHealth := autopilotToServerHealth(srv)

health.Servers = append(health.Servers, srvHealth)
}
err := s.autopilotStateExt(state, health)
if err != nil {
s.logger.Error("Error parsing autopilot state", "error", err)
}

return health
}

// -------------------
// helper functions

// autopilotToServerHealth converts one autopilot server state entry into the
// structs.ServerHealth value that GetClusterHealth appends to its reply.
func autopilotToServerHealth(srv *autopilot.ServerState) structs.ServerHealth {
	// Translate the autopilot node status into the matching serf status; any
	// status not listed below maps to serf.StatusNone.
	serfStatus := serf.StatusNone
	switch srv.Server.NodeStatus {
	case autopilot.NodeAlive:
		serfStatus = serf.StatusAlive
	case autopilot.NodeLeft:
		serfStatus = serf.StatusLeft
	case autopilot.NodeFailed:
		serfStatus = serf.StatusFailed
	}

	// The leader is counted as a voter as well.
	isLeader := srv.State == autopilot.RaftLeader
	isVoter := isLeader || srv.State == autopilot.RaftVoter

	return structs.ServerHealth{
		ID:          string(srv.Server.ID),
		Name:        srv.Server.Name,
		Address:     string(srv.Server.Address),
		Version:     srv.Server.Version,
		SerfStatus:  serfStatus,
		Leader:      isLeader,
		Voter:       isVoter,
		LastContact: srv.Stats.LastContact,
		LastTerm:    srv.Stats.LastTerm,
		LastIndex:   srv.Stats.LastIndex,
		Healthy:     srv.Health.Healthy,
		StableSince: srv.Health.StableSince,
	}
}

// stringIDs returns the given raft server IDs converted to plain strings, in
// the same order. The result is always a non-nil slice, even when ids is
// empty or nil.
func stringIDs(ids []raft.ServerID) []string {
	converted := make([]string, 0, len(ids))
	for _, serverID := range ids {
		converted = append(converted, string(serverID))
	}
	return converted
}

func minRaftProtocol(members []serf.Member, serverFunc func(serf.Member) (bool, *serverParts)) (int, error) {
minVersion := -1
for _, m := range members {
Expand Down
4 changes: 4 additions & 0 deletions nomad/autopilot_ce.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ func (s *Server) autopilotServerExt(_ *serverParts) interface{} {
return nil
}

// autopilotStateExt is the hook GetClusterHealth calls to fold extended
// autopilot state into the health reply. This implementation adds nothing: it
// ignores both arguments and always returns a nil error.
func (s *Server) autopilotStateExt(_ *autopilot.State, _ *structs.OperatorHealthReply) error {
	return nil
}

// autopilotConfigExt returns the autopilot-enterprise.Config extensions needed
// for ENT feature support, but this is the empty OSS implementation.
func autopilotConfigExt(_ *structs.AutopilotConfig) interface{} {
Expand Down
38 changes: 38 additions & 0 deletions nomad/autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/shoenig/test/must"

"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
)

Expand Down Expand Up @@ -306,5 +307,42 @@ func TestAutopilot_PromoteNonVoter(t *testing.T) {
}
return true, nil
}, func(err error) { must.NoError(t, err) })
}

// TestAutopilot_ReturnAutopilotHealth starts a two-server cluster and checks
// that the Operator.ServerHealth RPC reports the cluster as healthy and names
// the current Raft leader.
func TestAutopilot_ReturnAutopilotHealth(t *testing.T) {
	ci.Parallel(t)

	// Both servers are started with the same configuration.
	configure := func(c *Config) {
		c.BootstrapExpect = 2
		c.RaftConfig.ProtocolVersion = 3
		c.AutopilotConfig.EnableCustomUpgrades = true
		c.UpgradeVersion = "0.0.1"
		c.NumSchedulers = 0 // reduce log noise
	}

	s1, cleanupS1 := TestServer(t, configure)
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, configure)
	defer cleanupS2()

	TestJoin(t, s1, s2)
	servers := []*Server{s1, s2}
	leader := waitForStableLeadership(t, servers)

	get := &structs.GenericRequest{
		QueryOptions: structs.QueryOptions{
			Region: "global",
		},
	}
	reply := &structs.OperatorHealthReply{}
	err := s1.RPC("Operator.ServerHealth", get, reply)
	must.NoError(t, err)

	must.True(t, reply.Healthy)

	// must.Eq takes the expected value first, then the value under test, so
	// that a failure labels the two sides correctly.
	_, leaderID := leader.raft.LeaderWithID()
	must.Eq(t, string(leaderID), reply.Leader)
}

0 comments on commit cb8180d

Please sign in to comment.