Skip to content

Commit

Permalink
[feat] health check for katib-controller (#1934)
Browse files Browse the repository at this point in the history
* [feat]: add health check endpoint

* remove time sleep in github action test script

* add error check

* update docs
  • Loading branch information
anencore94 committed Oct 11, 2022
1 parent 96ab64b commit aaa42c1
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 9 deletions.
23 changes: 20 additions & 3 deletions cmd/katib-controller/v1beta1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/spf13/viper"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/healthz"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/manager"
Expand All @@ -44,6 +45,7 @@ func main() {

var experimentSuggestionName string
var metricsAddr string
var healthzAddr string
var webhookPort int
var injectSecurityContext bool
var enableGRPCProbeInSuggestion bool
Expand All @@ -54,6 +56,7 @@ func main() {
flag.StringVar(&experimentSuggestionName, "experiment-suggestion-name",
"default", "The implementation of suggestion interface in experiment controller (default)")
flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&healthzAddr, "healthz-addr", ":18080", "The address the healthz endpoint binds to.")
flag.BoolVar(&injectSecurityContext, "webhook-inject-securitycontext", false, "Inject the securityContext of container[0] in the sidecar")
flag.BoolVar(&enableGRPCProbeInSuggestion, "enable-grpc-probe-in-suggestion", true, "enable grpc probe in suggestions")
flag.Var(&trialResources, "trial-resources", "The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org)")
Expand Down Expand Up @@ -82,6 +85,8 @@ func main() {
webhookPort,
"metrics-addr",
metricsAddr,
"healthz-addr",
healthzAddr,
consts.ConfigInjectSecurityContext,
viper.GetBool(consts.ConfigInjectSecurityContext),
consts.ConfigEnableGRPCProbeInSuggestion,
Expand All @@ -99,9 +104,10 @@ func main() {

// Create a new katib controller to provide shared dependencies and start components
mgr, err := manager.New(cfg, manager.Options{
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
MetricsBindAddress: metricsAddr,
HealthProbeBindAddress: healthzAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
})
if err != nil {
log.Error(err, "Failed to create the manager")
Expand Down Expand Up @@ -129,6 +135,17 @@ func main() {
os.Exit(1)
}

log.Info("Setting up health checker.")
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
log.Error(err, "Unable to add healthz endpoint to the manager")
os.Exit(1)
}
// TODO (@anencore94): Add a more detailed check at '/readyz', e.g. whether it is possible to communicate with k8s-apiserver or db-manager.
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
log.Error(err, "Unable to add readyz endpoint to the manager")
os.Exit(1)
}

// Start the Cmd
log.Info("Starting the Cmd.")
if err := mgr.Start(signals.SetupSignalHandler()); err != nil {
Expand Down
3 changes: 2 additions & 1 deletion docs/developer-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ Below is a list of command-line flags accepted by Katib controller:
|---------------------------------|---------------------------|-------------------------------|------------------------------------------------------------------------------------------------------------------------|
| enable-grpc-probe-in-suggestion | bool | true | Enable grpc probe in suggestions |
| experiment-suggestion-name | string | "default" | The implementation of suggestion interface in experiment controller |
| metrics-addr | string | ":8080" | The address the metric endpoint binds to |
| metrics-addr | string | ":8080" | The address that the metrics endpoint binds to |
| healthz-addr | string | ":18080" | The address that the healthz endpoint binds to |
| trial-resources | []schema.GroupVersionKind | null | The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org) |
| webhook-inject-securitycontext | bool | false | Inject the securityContext of container[0] in the sidecar |
| webhook-port | int | 8443 | The port number to be used for admission webhook server |
Expand Down
11 changes: 11 additions & 0 deletions manifests/v1beta1/components/controller/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ spec:
- containerPort: 8080
name: metrics
protocol: TCP
- containerPort: 18080
name: healthz
protocol: TCP
readinessProbe:
httpGet:
path: /readyz
port: healthz
livenessProbe:
httpGet:
path: /healthz
port: healthz
env:
- name: KATIB_CORE_NAMESPACE
valueFrom:
Expand Down
3 changes: 3 additions & 0 deletions manifests/v1beta1/components/controller/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,8 @@ spec:
- name: metrics
port: 8080
targetPort: 8080
- name: healthz
port: 18080
targetPort: 18080
selector:
katib.kubeflow.org/component: controller
4 changes: 3 additions & 1 deletion pkg/webhook/v1beta1/webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
)

func AddToManager(mgr manager.Manager, port int) error {

// Create a webhook server.
hookServer := &webhook.Server{
Port: port,
Expand All @@ -36,6 +35,9 @@ func AddToManager(mgr manager.Manager, port int) error {
if err := mgr.Add(hookServer); err != nil {
return fmt.Errorf("Add webhook server to the manager failed: %v", err)
}
if err := mgr.AddHealthzCheck("healthz", hookServer.StartedChecker()); err != nil {
return fmt.Errorf("Add webhook server health checker to the manager failed: %v", err)
}

experimentValidator := experiment.NewExperimentValidator(mgr.GetClient())
experimentDefaulter := experiment.NewExperimentDefaulter(mgr.GetClient())
Expand Down
4 changes: 0 additions & 4 deletions test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,6 @@ kubectl wait --for=condition=complete --timeout=${TIMEOUT} -l katib.kubeflow.org
kubectl wait --for=condition=ready --timeout=${TIMEOUT} -l "katib.kubeflow.org/component in ($WITH_DATABASE_TYPE,controller,db-manager,ui)" -n kubeflow pod ||
(kubectl get pods -n kubeflow && kubectl describe pods -n kubeflow && exit 1)

# Wait until all Katib pods are actually ready.
# Since katib-controller does not use a readinessProbe yet, just wait for a while.
sleep 30

echo "All Katib components are running."
echo "Katib deployments"
kubectl -n kubeflow get deploy
Expand Down

0 comments on commit aaa42c1

Please sign in to comment.