From 4335b7ea9e6aee041a4bd47db7735182c1a1a452 Mon Sep 17 00:00:00 2001 From: Jan Safranek Date: Thu, 4 Jan 2024 14:41:35 +0100 Subject: [PATCH] Don't exit the probe on connection issues Do not exit the liveness probe process when the liveness probe cannot connect to the CSI driver. The driver could be crashlooping, and we should not crashloop the liveness probe process too. The process should only fail all probes to /healthz endpoint. Since the HTTP server is not running when connecting to the driver for the first time, "connection refused" must be a good enough failure. --- cmd/livenessprobe/main.go | 44 +++++++-------------------------------- go.mod | 2 +- 2 files changed, 9 insertions(+), 37 deletions(-) diff --git a/cmd/livenessprobe/main.go b/cmd/livenessprobe/main.go index 906bd90c..c62beb38 100644 --- a/cmd/livenessprobe/main.go +++ b/cmd/livenessprobe/main.go @@ -23,10 +23,8 @@ import ( "net" "net/http" "os" - "sync" "time" - "google.golang.org/grpc" "k8s.io/klog/v2" "k8s.io/component-base/featuregate" @@ -62,7 +60,7 @@ func (h *healthProbe) checkProbe(w http.ResponseWriter, req *http.Request) { ctx, cancel := context.WithTimeout(req.Context(), *probeTimeout) defer cancel() - conn, err := acquireConnection(ctx, h.metricsManager) + conn, err := connlib.Connect(*csiAddress, h.metricsManager, connlib.WithTimeout(*probeTimeout)) if err != nil { w.WriteHeader(http.StatusInternalServerError) w.Write([]byte(err.Error())) @@ -92,37 +90,6 @@ func (h *healthProbe) checkProbe(w http.ResponseWriter, req *http.Request) { klog.V(5).InfoS("Health check succeeded") } -// acquireConnection wraps the connlib.Connect but adding support to context -// cancelation. -func acquireConnection(ctx context.Context, metricsManager metrics.CSIMetricsManager) (conn *grpc.ClientConn, err error) { - - var m sync.Mutex - var canceled bool - ready := make(chan bool) - go func() { - conn, err = connlib.Connect(*csiAddress, metricsManager) - - m.Lock() - defer m.Unlock() - if err != nil && canceled && conn != nil { - conn.Close() - } - - close(ready) - }() - - select { - case <-ctx.Done(): - m.Lock() - defer m.Unlock() - canceled = true - return nil, ctx.Err() - - case <-ready: - return conn, err - } -} - func main() { fg := featuregate.NewFeatureGate() logsapi.AddFeatureGates(fg) @@ -151,10 +118,14 @@ func main() { } metricsManager := metrics.NewCSIMetricsManager("" /* driverName */) - csiConn, err := acquireConnection(context.Background(), metricsManager) + // Connect to the CSI driver without any timeout to avoid crashing the probe when the driver is not ready yet. + // Goal: liveness probe never crashes, it only fails the probe when the driver is not available (yet). + // Since a http server for the probe is not running at this point, Kubernetes liveness probe will fail immediately + // with "connection refused", which is good enough to fail the probe. + csiConn, err := connlib.Connect(*csiAddress, metricsManager, connlib.WithTimeout(0)) if err != nil { // connlib should retry forever so a returned error should mean - // the grpc client is misconfigured rather than an error on the network + // the grpc client is misconfigured rather than an error on the network or CSI driver. klog.ErrorS(err, "Failed to establish connection to CSI driver") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } @@ -163,6 +134,7 @@ func main() { csiDriverName, err := rpc.GetDriverName(context.Background(), csiConn) csiConn.Close() if err != nil { + // The CSI driver does not support GetDriverName, which is serious enough to crash the probe. klog.ErrorS(err, "Failed to get CSI driver name") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } diff --git a/go.mod b/go.mod index 787cc161..66f1999c 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,6 @@ require ( github.com/golang/mock v1.6.0 github.com/kubernetes-csi/csi-lib-utils v0.17.0 github.com/kubernetes-csi/csi-test/v5 v5.2.0 - google.golang.org/grpc v1.60.1 k8s.io/component-base v0.29.0 k8s.io/klog/v2 v2.110.1 ) @@ -46,6 +45,7 @@ require ( golang.org/x/sys v0.14.0 // indirect golang.org/x/text v0.14.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20231106174013-bbf56f31fb17 // indirect + google.golang.org/grpc v1.60.1 // indirect google.golang.org/protobuf v1.31.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect