Skip to content

Commit

Permalink
Implement KatibConfig API (#2176)
Browse files Browse the repository at this point in the history
* Implement KatibConfig API

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Replace 'collectorKind' with 'kind'

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Replace 'metricsCollectorSidecars' with 'metricsCollectors'

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Fix a typo

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Make the init.controller.leaderElection non-pointer

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Make the init.controller.injectSecurityContext non-pointer

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Update a comment for the future works

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Update manifest for the katib-leader-election

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Fix a comment for the KatibConfig API

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Replace 'configapi' with 'configv1beta1'

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Remove debug code

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Put constant for the default KatibConfig value on /pkg/apis/config/v1beta1

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Use 'sigs.k8s.io/yaml' instead of 'github.com/ghodss/yaml'

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Avoid to depend on k8s.io/utils directly

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Fix a typo

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Refactor katib-config using kustomize vars

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Fix a typo

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Put KatibConfig on every install

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Remove configMapGenerator from the katib-with-kubeflow

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

---------

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
  • Loading branch information
tenzen-y committed Aug 1, 2023
1 parent f1e3f3a commit e69235d
Show file tree
Hide file tree
Showing 49 changed files with 1,995 additions and 792 deletions.
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,18 @@ endif
sync-go-mod:
go mod tidy -go $(GO_VERSION)

CONTROLLER_GEN = $(shell pwd)/bin/controller-gen
.PHONY: controller-gen
controller-gen:
@GOBIN=$(shell pwd)/bin GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.10.0

# Run this if you update any existing controller APIs.
# 1. Generate deepcopy, clientset, listers, informers for the APIs (hack/update-codegen.sh)
# 2. Generate open-api for the APIs (hack/update-openapigen)
# 3. Generate Python SDK for Katib (hack/gen-python-sdk/gen-sdk.sh)
# 4. Generate gRPC manager APIs (pkg/apis/manager/v1beta1/build.sh and pkg/apis/manager/health/build.sh)
# 5. Generate Go mock codes
generate:
generate: controller-gen
ifndef GOPATH
$(error GOPATH not defined, please define GOPATH. Run "go help gopath" to learn more about GOPATH)
endif
Expand Down
84 changes: 43 additions & 41 deletions cmd/katib-controller/v1beta1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"os"

"github.com/spf13/viper"
"k8s.io/apimachinery/pkg/runtime"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
"sigs.k8s.io/controller-runtime/pkg/client/config"
Expand All @@ -33,38 +34,32 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/manager/signals"

configv1beta1 "github.com/kubeflow/katib/pkg/apis/config/v1beta1"
apis "github.com/kubeflow/katib/pkg/apis/controller"
controller "github.com/kubeflow/katib/pkg/controller.v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
trialutil "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/util"
"github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig"
webhook "github.com/kubeflow/katib/pkg/webhook/v1beta1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
)

var scheme = runtime.NewScheme()

func init() {
utilruntime.Must(apis.AddToScheme(scheme))
utilruntime.Must(configv1beta1.AddToScheme(scheme))
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
}

func main() {
logf.SetLogger(zap.New())
log := logf.Log.WithName("entrypoint")

var experimentSuggestionName string
var metricsAddr string
var healthzAddr string
var webhookPort int
var injectSecurityContext bool
var enableGRPCProbeInSuggestion bool
var trialResources trialutil.GvkListFlag
var enableLeaderElection bool
var leaderElectionID string

flag.StringVar(&experimentSuggestionName, "experiment-suggestion-name",
"default", "The implementation of suggestion interface in experiment controller (default)")
flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&healthzAddr, "healthz-addr", ":18080", "The address the healthz endpoint binds to.")
flag.BoolVar(&injectSecurityContext, "webhook-inject-securitycontext", false, "Inject the securityContext of container[0] in the sidecar")
flag.BoolVar(&enableGRPCProbeInSuggestion, "enable-grpc-probe-in-suggestion", true, "enable grpc probe in suggestions")
flag.Var(&trialResources, "trial-resources", "The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org)")
flag.IntVar(&webhookPort, "webhook-port", 8443, "The port number to be used for admission webhook server.")
// For leader election
flag.BoolVar(&enableLeaderElection, "enable-leader-election", false, "Enable leader election for katib-controller. Enabling this will ensure there is only one active katib-controller.")
flag.StringVar(&leaderElectionID, "leader-election-id", "3fbc96e9.katib.kubeflow.org", "The ID for leader election.")
var katibConfigFile string
flag.StringVar(&katibConfigFile, "katib-config", "",
"The katib-controller will load its initial configuration from this file. "+
"Omit this flag to use the default configuration values. ")

// TODO (andreyvelich): Currently it is not possible to set different webhook service name.
// flag.StringVar(&serviceName, "webhook-service-name", "katib-controller", "The service name which will be used in webhook")
Expand All @@ -73,21 +68,33 @@ func main() {

flag.Parse()

initConfig, err := katibconfig.GetInitConfigData(scheme, katibConfigFile)
if err != nil {
log.Error(err, "Failed to get KatibConfig")
os.Exit(1)
}

// Set the config in viper.
viper.Set(consts.ConfigExperimentSuggestionName, experimentSuggestionName)
viper.Set(consts.ConfigInjectSecurityContext, injectSecurityContext)
viper.Set(consts.ConfigEnableGRPCProbeInSuggestion, enableGRPCProbeInSuggestion)
viper.Set(consts.ConfigTrialResources, trialResources)
viper.Set(consts.ConfigExperimentSuggestionName, initConfig.ControllerConfig.ExperimentSuggestionName)
viper.Set(consts.ConfigInjectSecurityContext, initConfig.ControllerConfig.InjectSecurityContext)
viper.Set(consts.ConfigEnableGRPCProbeInSuggestion, initConfig.ControllerConfig.EnableGRPCProbeInSuggestion)

trialGVKs, err := katibconfig.TrialResourcesToGVKs(initConfig.ControllerConfig.TrialResources)
if err != nil {
log.Error(err, "Failed to parse trialResources")
os.Exit(1)
}
viper.Set(consts.ConfigTrialResources, trialGVKs)

log.Info("Config:",
consts.ConfigExperimentSuggestionName,
viper.GetString(consts.ConfigExperimentSuggestionName),
"webhook-port",
webhookPort,
initConfig.ControllerConfig.WebhookPort,
"metrics-addr",
metricsAddr,
initConfig.ControllerConfig.MetricsAddr,
"healthz-addr",
healthzAddr,
initConfig.ControllerConfig.HealthzAddr,
consts.ConfigInjectSecurityContext,
viper.GetBool(consts.ConfigInjectSecurityContext),
consts.ConfigEnableGRPCProbeInSuggestion,
Expand All @@ -105,10 +112,11 @@ func main() {

// Create a new katib controller to provide shared dependencies and start components
mgr, err := manager.New(cfg, manager.Options{
MetricsBindAddress: metricsAddr,
HealthProbeBindAddress: healthzAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
MetricsBindAddress: initConfig.ControllerConfig.MetricsAddr,
HealthProbeBindAddress: initConfig.ControllerConfig.HealthzAddr,
LeaderElection: initConfig.ControllerConfig.EnableLeaderElection,
LeaderElectionID: initConfig.ControllerConfig.LeaderElectionID,
Scheme: scheme,
// TODO: Once the below issue is resolved, we need to switch discovery-client to the built-in one.
// https://github.com/kubernetes-sigs/controller-runtime/issues/2354
// https://github.com/kubernetes-sigs/controller-runtime/issues/2424
Expand All @@ -121,12 +129,6 @@ func main() {

log.Info("Registering Components.")

// Setup Scheme for all resources
if err := apis.AddToScheme(mgr.GetScheme()); err != nil {
log.Error(err, "Unable to add APIs to scheme")
os.Exit(1)
}

// Setup all Controllers
log.Info("Setting up controller.")
if err := controller.AddToManager(mgr); err != nil {
Expand All @@ -135,7 +137,7 @@ func main() {
}

log.Info("Setting up webhooks.")
if err := webhook.AddToManager(mgr, webhookPort); err != nil {
if err := webhook.AddToManager(mgr, *initConfig.ControllerConfig.WebhookPort); err != nil {
log.Error(err, "Unable to register webhooks to the manager")
os.Exit(1)
}
Expand Down
16 changes: 4 additions & 12 deletions docs/developer-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ make build REGISTRY=<image-registry> TAG=<image-tag>

To use your custom images for the Katib components, modify
[Kustomization file](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/installs/katib-standalone/kustomization.yaml)
and [Katib Config](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/components/controller/katib-config.yaml)
and [Katib Config](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/installs/katib-standalone/katib-config.yaml)

You can deploy Katib v1beta1 manifests into a Kubernetes cluster as follows:

Expand Down Expand Up @@ -57,17 +57,9 @@ make generate

Below is a list of command-line flags accepted by Katib controller:

| Name | Type | Default | Description |
| ------------------------------- | ------------------------- | ----------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
| enable-grpc-probe-in-suggestion | bool | true | Enable grpc probe in suggestions |
| experiment-suggestion-name | string | "default" | The implementation of suggestion interface in experiment controller |
| metrics-addr | string | ":8080" | The address that the metrics endpoint binds to |
| healthz-addr | string | ":18080" | The address that the healthz endpoint binds to |
| trial-resources | []schema.GroupVersionKind | null | The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org) |
| webhook-inject-securitycontext | bool | false | Inject the securityContext of container[0] in the sidecar |
| webhook-port | int | 8443 | The port number to be used for admission webhook server |
| enable-leader-election | bool | false | Enable leader election for katib-controller. Enabling this will ensure there is only one active katib-controller. |
| leader-election-id | string | "3fbc96e9.katib.kubeflow.org" | The ID for leader election. |
| Name | Type | Default | Description |
|--------------|--------|---------|----------------------------------------------------------------------------------------------------------------------------------|
| katib-config | string | "" | The katib-controller will load its initial configuration from this file. Omit this flag to use the default configuration values. |

## DB Manager Flags

Expand Down
22 changes: 9 additions & 13 deletions docs/new-algorithm-service.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,21 +90,17 @@ Then build the Docker image.

### Use the algorithm in Katib.

Update the [Katib config](../manifests/v1beta1/components/controller/katib-config.yaml) and [operator](../operators/katib-controller/src/suggestion.json) with the new algorithm entity:
Update the [Katib config](../manifests/v1beta1/installs/katib-standalone/katib-config.yaml) with the new algorithm entity:

```diff
suggestion: |-
{
"tpe": {
"image": "docker.io/kubeflowkatib/suggestion-hyperopt"
},
"random": {
"image": "docker.io/kubeflowkatib/suggestion-hyperopt"
},
+ "<new-algorithm-name>": {
+ "image": "image built in the previous stage"
+ }
}
runtime:
suggestions:
- algorithmName: random
image: docker.io/kubeflowkatib/suggestion-hyperopt:$(KATIB_VERSION)
- algorithmName: tpe
image: docker.io/kubeflowkatib/suggestion-hyperopt:$(KATIB_VERSION)
+ - algorithmName: <new-algorithm-name>
+ image: "image built in the previous stage":$(KATIB_VERSION)
```

Learn more about Katib config in the
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ require (
github.com/go-sql-driver/mysql v1.5.0
github.com/golang/mock v1.6.0
github.com/golang/protobuf v1.5.3
github.com/google/go-cmp v0.5.9
github.com/google/go-containerregistry v0.9.0
github.com/google/go-containerregistry/pkg/authn/k8schain v0.0.0-20211222182933-7c19fa370dbd
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
Expand All @@ -31,6 +32,7 @@ require (
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f
sigs.k8s.io/controller-runtime v0.15.0
sigs.k8s.io/structured-merge-diff/v4 v4.2.3
sigs.k8s.io/yaml v1.3.0
)

require (
Expand Down Expand Up @@ -65,7 +67,6 @@ require (
github.com/golang-jwt/jwt/v4 v4.4.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/google/gnostic v0.5.7-v3refs // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
Expand Down Expand Up @@ -135,5 +136,4 @@ require (
k8s.io/legacy-cloud-providers v0.21.0 // indirect
k8s.io/utils v0.0.0-20230209194617-a36077c30491 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)
7 changes: 6 additions & 1 deletion hack/update-codegen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ if [[ -z "${GOPATH:-}" ]]; then
fi

# Grab code-generator version from go.mod
CODEGEN_VERSION=$(cd ../../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}')
CODEGEN_VERSION=$(cd ../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}')
CODEGEN_PKG="$GOPATH/pkg/mod/k8s.io/code-generator@${CODEGEN_VERSION}"

if [[ ! -d "${CODEGEN_PKG}" ]]; then
Expand All @@ -53,3 +53,8 @@ echo "Generating clients for ${GROUP_VERSIONS} ..."
github.com/kubeflow/katib/pkg/apis/controller \
"${GROUP_VERSIONS}" \
--go-header-file "${PROJECT_ROOT}/hack/boilerplate/boilerplate.go.txt"

echo "Generating deepcopy for config.kubeflow.org ..."
"${PROJECT_ROOT}/bin/controller-gen" \
object:headerFile="${PROJECT_ROOT}/hack/boilerplate/boilerplate.go.txt" \
paths="${PROJECT_ROOT}/pkg/apis/config/..."
2 changes: 1 addition & 1 deletion hack/update-openapigen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ if [[ -z "${GOPATH:-}" ]]; then
fi

# Grab code-generator version from go.mod
CODEGEN_VERSION=$(cd ../../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}')
CODEGEN_VERSION=$(cd ../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}')
CODEGEN_PKG="${GOPATH}/pkg/mod/k8s.io/code-generator@${CODEGEN_VERSION}"

if [[ ! -d ${CODEGEN_PKG} ]]; then
Expand Down
15 changes: 8 additions & 7 deletions manifests/v1beta1/components/controller/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,7 @@ spec:
image: docker.io/kubeflowkatib/katib-controller
command: ["./katib-controller"]
args:
- "--webhook-port=8443"
- "--trial-resources=Job.v1.batch"
- "--trial-resources=TFJob.v1.kubeflow.org"
- "--trial-resources=PyTorchJob.v1.kubeflow.org"
- "--trial-resources=MPIJob.v1.kubeflow.org"
- "--trial-resources=XGBoostJob.v1.kubeflow.org"
- "--trial-resources=MXJob.v1.kubeflow.org"
- --katib-config=/katib-config.yaml
ports:
- containerPort: 8443
name: webhook
Expand Down Expand Up @@ -60,8 +54,15 @@ spec:
- mountPath: /tmp/cert
name: cert
readOnly: true
- mountPath: /katib-config.yaml
name: katib-config
subPath: katib-config.yaml
readOnly: true
volumes:
- name: cert
secret:
defaultMode: 420
secretName: katib-webhook-cert
- name: katib-config
configMap:
name: katib-config
81 changes: 0 additions & 81 deletions manifests/v1beta1/components/controller/katib-config.yaml

This file was deleted.

Loading

0 comments on commit e69235d

Please sign in to comment.