Skip to content

Commit

Permalink
operator: LeaderElectionReleaseOnCancel
Browse files Browse the repository at this point in the history
When manually restarting the operator, the leader election may
take 5+ minutes to acquire the lease on startup:

```
I1205 16:06:02.101302       1 leaderelection.go:245] attempting to acquire leader lease openshift-sriov-network-operator/a56def2a.openshift.io...
...
I1205 16:08:40.133558       1 leaderelection.go:255] successfully acquired lease openshift-sriov-network-operator/a56def2a.openshift.io
```

The manager's option `LeaderElectionReleaseOnCancel` would solve this
problem, but it's not safe as the shutdown cleanup procedures
(inhibiting webhooks and removing finalizers) would run without any
leader guard.

This commit moves the LeaderElection mechanism from the namespaced
manager to a dedicated, no-op controller manager. This approach was
preferred over dealing with the LeaderElection API directly because:
- It leverages library code that has proven to be stable
- It includes recording k8s Events about the Lease process
- The election process must come after setting up the health probe.
  Doing it manually would involve handling the healthz endpoint as well.

Signed-off-by: Andrea Panattoni <apanatto@redhat.com>
  • Loading branch information
zeeke committed Dec 20, 2023
1 parent 1939bc8 commit 28cff08
Showing 1 changed file with 55 additions and 19 deletions.
74 changes: 55 additions & 19 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"flag"
"fmt"
"os"
"sync"

netattdefv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
openshiftconfigv1 "github.com/openshift/api/config/v1"
Expand Down Expand Up @@ -99,22 +100,42 @@ func main() {
le := leaderelection.GetLeaderElectionConfig(kubeClient, enableLeaderElection)

namespace := os.Getenv("NAMESPACE")
leaderElectionMgr, err := ctrl.NewManager(restConfig, ctrl.Options{
Scheme: scheme,
HealthProbeBindAddress: probeAddr,
Metrics: server.Options{BindAddress: "0"},
LeaderElection: enableLeaderElection,
LeaseDuration: &le.LeaseDuration,
LeaderElectionReleaseOnCancel: true,
RenewDeadline: &le.RenewDeadline,
RetryPeriod: &le.RetryPeriod,
LeaderElectionID: "a56def2a.openshift.io",
})
if err != nil {
setupLog.Error(err, "unable to start leader election manager")
os.Exit(1)
}

if err := leaderElectionMgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
}
if err := leaderElectionMgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}

mgr, err := ctrl.NewManager(restConfig, ctrl.Options{
Scheme: scheme,
Metrics: server.Options{BindAddress: metricsAddr},
WebhookServer: webhook.NewServer(webhook.Options{Port: 9443}),
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaseDuration: &le.LeaseDuration,
RenewDeadline: &le.RenewDeadline,
RetryPeriod: &le.RetryPeriod,
LeaderElectionID: "a56def2a.openshift.io",
Cache: cache.Options{DefaultNamespaces: map[string]cache.Config{namespace: {}}},
Scheme: scheme,
Metrics: server.Options{BindAddress: metricsAddr},
WebhookServer: webhook.NewServer(webhook.Options{Port: 9443}),
Cache: cache.Options{DefaultNamespaces: map[string]cache.Config{namespace: {}}},
})
if err != nil {
setupLog.Error(err, "unable to start manager")
os.Exit(1)
}

mgrGlobal, err := ctrl.NewManager(restConfig, ctrl.Options{
Scheme: scheme,
Metrics: server.Options{BindAddress: "0"},
Expand Down Expand Up @@ -190,31 +211,46 @@ func main() {
os.Exit(1)
}

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
var managersWg sync.WaitGroup
leaderElectionContext, cancelLeaderElection := context.WithCancel(context.Background())
managersWg.Add(1)
go func() {
defer managersWg.Done()
setupLog.Info("starting leader election manager")
if err := leaderElectionMgr.Start(leaderElectionContext); err != nil {
setupLog.Error(err, "Leader Election Manager exited non-zero")
os.Exit(1)
}
}()

<-leaderElectionMgr.Elected()

stopCh := ctrl.SetupSignalHandler()

managersWg.Add(1)
go func() {
defer managersWg.Done()
setupLog.Info("starting global manager")
if err := mgrGlobal.Start(stopCh); err != nil {
setupLog.Error(err, "Manager Global exited non-zero")
os.Exit(1)
}
}()

// LeaderElection must be canceled after resource cleanups
defer cancelLeaderElection()

// Remove all finalizers after controller is shut down
defer utils.Shutdown()

setupLog.Info("starting manager")
setupLog.Info("starting namespaced manager")
if err := mgr.Start(stopCh); err != nil {
setupLog.Error(err, "problem running manager")
os.Exit(1)
}

// Wait for all the other managers to have finished. leaderElectionMgr will release the lock when correctly stopped
managersWg.Wait()
}

func initNicIDMap() error {
Expand Down

0 comments on commit 28cff08

Please sign in to comment.