Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nfd-master: run a separate gRPC health server #1535

Merged
merged 1 commit into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmd/nfd-master/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ import (

const (
// ProgramName is the canonical name of this program
ProgramName = "nfd-master"
ProgramName = "nfd-master"
GrpcHealthPort = 8082
)

func main() {
Expand Down Expand Up @@ -100,6 +101,7 @@ func main() {
utils.ConfigureGrpcKlog()

// Get new NfdMaster instance
args.GrpcHealthPort = GrpcHealthPort
instance, err := master.NewNfdMaster(args)
if err != nil {
klog.ErrorS(err, "failed to initialize NfdMaster instance")
Expand Down
6 changes: 2 additions & 4 deletions deployment/base/master/master-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ spec:
imagePullPolicy: Always
livenessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
Expand All @@ -37,5 +37,3 @@ spec:
ports:
- name: metrics
containerPort: 8081
- name: grpc
containerPort: 8080
4 changes: 2 additions & 2 deletions deployment/helm/node-feature-discovery/templates/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ spec:
imagePullPolicy: {{ .Values.image.pullPolicy }}
livenessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 10
periodSeconds: 10
readinessProbe:
grpc:
port: 8080
port: 8082
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 10
Expand Down
57 changes: 49 additions & 8 deletions pkg/nfd-master/nfd-master.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@
CrdController bool
EnableNodeFeatureApi bool
Port int
// GrpcHealthPort is only needed to avoid races between tests (by skipping the health server).
// Could be removed when gRPC labler service is dropped (when nfd-worker tests stop running nfd-master).
GrpcHealthPort int
Prune bool
VerifyNodeName bool
Options string
Expand Down Expand Up @@ -144,6 +147,7 @@
nodeName string
configFilePath string
server *grpc.Server
healthServer *grpc.Server
stop chan struct{}
ready chan bool
apihelper apihelper.APIHelpers
Expand Down Expand Up @@ -270,7 +274,11 @@

// Run gRPC server
grpcErr := make(chan error, 1)
go m.runGrpcServer(grpcErr)
// If the NodeFeature API is enabled, don'tregister the labeler API
// server. Otherwise, register the labeler server.
if !m.args.EnableNodeFeatureApi {
go m.runGrpcServer(grpcErr)
}

// Run updater that handles events from the nfd CRD API.
if m.nfdController != nil {
Expand All @@ -281,6 +289,13 @@
}
}

// Start gRPC server for liveness probe (at this point we're "live")
if m.args.GrpcHealthPort != 0 {
if err := m.startGrpcHealthServer(grpcErr); err != nil {
return fmt.Errorf("failed to start gRPC health server: %w", err)
}

Check warning on line 296 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L294-L296

Added lines #L294 - L296 were not covered by tests
}

// Notify that we're ready to accept connections
m.ready <- true
close(m.ready)
Expand Down Expand Up @@ -323,6 +338,32 @@
}
}

// startGrpcHealthServer starts a gRPC health server for Kubernetes readiness/liveness probes.
// TODO: improve status checking e.g. with watchdog in the main event loop and
// cheking that node updater pool is alive.
func (m *nfdMaster) startGrpcHealthServer(errChan chan<- error) error {
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.GrpcHealthPort))
if err != nil {
return fmt.Errorf("failed to listen: %w", err)
}

Check warning on line 348 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L344-L348

Added lines #L344 - L348 were not covered by tests

s := grpc.NewServer()
grpc_health_v1.RegisterHealthServer(s, health.NewServer())
klog.InfoS("gRPC health server serving", "port", m.args.GrpcHealthPort)

go func() {
defer func() {
lis.Close()
}()
if err := s.Serve(lis); err != nil {
errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err)
}
klog.InfoS("gRPC health server stopped")

Check warning on line 361 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L350-L361

Added lines #L350 - L361 were not covered by tests
}()
m.healthServer = s
return nil

Check warning on line 364 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L363-L364

Added lines #L363 - L364 were not covered by tests
}

func (m *nfdMaster) runGrpcServer(errChan chan<- error) {
// Create server listening for TCP connections
lis, err := net.Listen("tcp", fmt.Sprintf(":%d", m.args.Port))
Expand Down Expand Up @@ -352,13 +393,8 @@
}
m.server = grpc.NewServer(serverOpts...)

// If the NodeFeature API is enabled, don'tregister the labeler API
// server. Otherwise, register the labeler server.
if !m.args.EnableNodeFeatureApi {
pb.RegisterLabelerServer(m.server, m)
}
pb.RegisterLabelerServer(m.server, m)

grpc_health_v1.RegisterHealthServer(m.server, health.NewServer())
klog.InfoS("gRPC server serving", "port", m.args.Port)

// Run gRPC server
Expand Down Expand Up @@ -421,7 +457,12 @@

// Stop NfdMaster
func (m *nfdMaster) Stop() {
m.server.GracefulStop()
if m.server != nil {
m.server.GracefulStop()
}
if m.healthServer != nil {
m.healthServer.GracefulStop()
}

Check warning on line 465 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L464-L465

Added lines #L464 - L465 were not covered by tests

if m.nfdController != nil {
m.nfdController.stop()
Expand Down