Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement KatibConfig API #2176

Merged
merged 19 commits into from
Aug 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,18 @@ endif
sync-go-mod:
go mod tidy -go $(GO_VERSION)

CONTROLLER_GEN = $(shell pwd)/bin/controller-gen
.PHONY: controller-gen
controller-gen:
@GOBIN=$(shell pwd)/bin GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.10.0
andreyvelich marked this conversation as resolved.
Show resolved Hide resolved

# Run this if you update any existing controller APIs.
# 1. Generate deepcopy, clientset, listers, informers for the APIs (hack/update-codegen.sh)
# 2. Generate open-api for the APIs (hack/update-openapigen)
# 3. Generate Python SDK for Katib (hack/gen-python-sdk/gen-sdk.sh)
# 4. Generate gRPC manager APIs (pkg/apis/manager/v1beta1/build.sh and pkg/apis/manager/health/build.sh)
# 5. Generate Go mock codes
generate:
generate: controller-gen
ifndef GOPATH
$(error GOPATH not defined, please define GOPATH. Run "go help gopath" to learn more about GOPATH)
endif
Expand Down
84 changes: 43 additions & 41 deletions cmd/katib-controller/v1beta1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"os"

"github.com/spf13/viper"
"k8s.io/apimachinery/pkg/runtime"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
"sigs.k8s.io/controller-runtime/pkg/client/config"
Expand All @@ -33,38 +34,32 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/manager/signals"

configv1beta1 "github.com/kubeflow/katib/pkg/apis/config/v1beta1"
apis "github.com/kubeflow/katib/pkg/apis/controller"
controller "github.com/kubeflow/katib/pkg/controller.v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
trialutil "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/util"
"github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig"
webhook "github.com/kubeflow/katib/pkg/webhook/v1beta1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
)

var scheme = runtime.NewScheme()

func init() {
utilruntime.Must(apis.AddToScheme(scheme))
utilruntime.Must(configv1beta1.AddToScheme(scheme))
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
andreyvelich marked this conversation as resolved.
Show resolved Hide resolved
}

func main() {
logf.SetLogger(zap.New())
log := logf.Log.WithName("entrypoint")

var experimentSuggestionName string
var metricsAddr string
var healthzAddr string
var webhookPort int
var injectSecurityContext bool
var enableGRPCProbeInSuggestion bool
var trialResources trialutil.GvkListFlag
var enableLeaderElection bool
var leaderElectionID string

flag.StringVar(&experimentSuggestionName, "experiment-suggestion-name",
"default", "The implementation of suggestion interface in experiment controller (default)")
flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&healthzAddr, "healthz-addr", ":18080", "The address the healthz endpoint binds to.")
flag.BoolVar(&injectSecurityContext, "webhook-inject-securitycontext", false, "Inject the securityContext of container[0] in the sidecar")
flag.BoolVar(&enableGRPCProbeInSuggestion, "enable-grpc-probe-in-suggestion", true, "enable grpc probe in suggestions")
flag.Var(&trialResources, "trial-resources", "The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org)")
flag.IntVar(&webhookPort, "webhook-port", 8443, "The port number to be used for admission webhook server.")
// For leader election
flag.BoolVar(&enableLeaderElection, "enable-leader-election", false, "Enable leader election for katib-controller. Enabling this will ensure there is only one active katib-controller.")
flag.StringVar(&leaderElectionID, "leader-election-id", "3fbc96e9.katib.kubeflow.org", "The ID for leader election.")
var katibConfigFile string
flag.StringVar(&katibConfigFile, "katib-config", "",
"The katib-controller will load its initial configuration from this file. "+
"Omit this flag to use the default configuration values. ")

// TODO (andreyvelich): Currently it is not possible to set different webhook service name.
// flag.StringVar(&serviceName, "webhook-service-name", "katib-controller", "The service name which will be used in webhook")
Expand All @@ -73,21 +68,33 @@ func main() {

flag.Parse()

initConfig, err := katibconfig.GetInitConfigData(scheme, katibConfigFile)
if err != nil {
log.Error(err, "Failed to get KatibConfig")
os.Exit(1)
}

// Set the config in viper.
viper.Set(consts.ConfigExperimentSuggestionName, experimentSuggestionName)
viper.Set(consts.ConfigInjectSecurityContext, injectSecurityContext)
viper.Set(consts.ConfigEnableGRPCProbeInSuggestion, enableGRPCProbeInSuggestion)
viper.Set(consts.ConfigTrialResources, trialResources)
viper.Set(consts.ConfigExperimentSuggestionName, initConfig.ControllerConfig.ExperimentSuggestionName)
viper.Set(consts.ConfigInjectSecurityContext, initConfig.ControllerConfig.InjectSecurityContext)
viper.Set(consts.ConfigEnableGRPCProbeInSuggestion, initConfig.ControllerConfig.EnableGRPCProbeInSuggestion)

trialGVKs, err := katibconfig.TrialResourcesToGVKs(initConfig.ControllerConfig.TrialResources)
if err != nil {
log.Error(err, "Failed to parse trialResources")
os.Exit(1)
}
viper.Set(consts.ConfigTrialResources, trialGVKs)

log.Info("Config:",
consts.ConfigExperimentSuggestionName,
viper.GetString(consts.ConfigExperimentSuggestionName),
"webhook-port",
webhookPort,
initConfig.ControllerConfig.WebhookPort,
"metrics-addr",
metricsAddr,
initConfig.ControllerConfig.MetricsAddr,
"healthz-addr",
healthzAddr,
initConfig.ControllerConfig.HealthzAddr,
consts.ConfigInjectSecurityContext,
viper.GetBool(consts.ConfigInjectSecurityContext),
consts.ConfigEnableGRPCProbeInSuggestion,
Expand All @@ -105,10 +112,11 @@ func main() {

// Create a new katib controller to provide shared dependencies and start components
mgr, err := manager.New(cfg, manager.Options{
MetricsBindAddress: metricsAddr,
HealthProbeBindAddress: healthzAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
MetricsBindAddress: initConfig.ControllerConfig.MetricsAddr,
HealthProbeBindAddress: initConfig.ControllerConfig.HealthzAddr,
LeaderElection: initConfig.ControllerConfig.EnableLeaderElection,
LeaderElectionID: initConfig.ControllerConfig.LeaderElectionID,
Scheme: scheme,
// TODO: Once the below issue is resolved, we need to switch discovery-client to the built-in one.
// https://github.com/kubernetes-sigs/controller-runtime/issues/2354
// https://github.com/kubernetes-sigs/controller-runtime/issues/2424
Expand All @@ -121,12 +129,6 @@ func main() {

log.Info("Registering Components.")

// Setup Scheme for all resources
if err := apis.AddToScheme(mgr.GetScheme()); err != nil {
log.Error(err, "Unable to add APIs to scheme")
os.Exit(1)
}

// Setup all Controllers
log.Info("Setting up controller.")
if err := controller.AddToManager(mgr); err != nil {
Expand All @@ -135,7 +137,7 @@ func main() {
}

log.Info("Setting up webhooks.")
if err := webhook.AddToManager(mgr, webhookPort); err != nil {
if err := webhook.AddToManager(mgr, *initConfig.ControllerConfig.WebhookPort); err != nil {
gaocegege marked this conversation as resolved.
Show resolved Hide resolved
log.Error(err, "Unable to register webhooks to the manager")
os.Exit(1)
}
Expand Down
16 changes: 4 additions & 12 deletions docs/developer-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ make build REGISTRY=<image-registry> TAG=<image-tag>

To use your custom images for the Katib components, modify
[Kustomization file](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/installs/katib-standalone/kustomization.yaml)
and [Katib Config](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/components/controller/katib-config.yaml)
and [Katib Config](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/installs/katib-standalone/katib-config.yaml)

You can deploy Katib v1beta1 manifests into a Kubernetes cluster as follows:

Expand Down Expand Up @@ -57,17 +57,9 @@ make generate

Below is a list of command-line flags accepted by Katib controller:

| Name | Type | Default | Description |
| ------------------------------- | ------------------------- | ----------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
| enable-grpc-probe-in-suggestion | bool | true | Enable grpc probe in suggestions |
| experiment-suggestion-name | string | "default" | The implementation of suggestion interface in experiment controller |
| metrics-addr | string | ":8080" | The address that the metrics endpoint binds to |
| healthz-addr | string | ":18080" | The address that the healthz endpoint binds to |
| trial-resources | []schema.GroupVersionKind | null | The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org) |
| webhook-inject-securitycontext | bool | false | Inject the securityContext of container[0] in the sidecar |
| webhook-port | int | 8443 | The port number to be used for admission webhook server |
| enable-leader-election | bool | false | Enable leader election for katib-controller. Enabling this will ensure there is only one active katib-controller. |
| leader-election-id | string | "3fbc96e9.katib.kubeflow.org" | The ID for leader election. |
| Name | Type | Default | Description |
|--------------|--------|---------|----------------------------------------------------------------------------------------------------------------------------------|
| katib-config | string | "" | The katib-controller will load its initial configuration from this file. Omit this flag to use the default configuration values. |

## DB Manager Flags

Expand Down
22 changes: 9 additions & 13 deletions docs/new-algorithm-service.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,21 +90,17 @@ Then build the Docker image.

### Use the algorithm in Katib.

Update the [Katib config](../manifests/v1beta1/components/controller/katib-config.yaml) and [operator](../operators/katib-controller/src/suggestion.json) with the new algorithm entity:
Update the [Katib config](../manifests/v1beta1/installs/katib-standalone/katib-config.yaml) with the new algorithm entity:

```diff
suggestion: |-
{
"tpe": {
"image": "docker.io/kubeflowkatib/suggestion-hyperopt"
},
"random": {
"image": "docker.io/kubeflowkatib/suggestion-hyperopt"
},
+ "<new-algorithm-name>": {
+ "image": "image built in the previous stage"
+ }
}
runtime:
suggestions:
- algorithmName: random
image: docker.io/kubeflowkatib/suggestion-hyperopt:$(KATIB_VERSION)
- algorithmName: tpe
image: docker.io/kubeflowkatib/suggestion-hyperopt:$(KATIB_VERSION)
+ - algorithmName: <new-algorithm-name>
+ image: "image built in the previous stage":$(KATIB_VERSION)
```

Learn more about Katib config in the
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ require (
github.com/go-sql-driver/mysql v1.5.0
github.com/golang/mock v1.6.0
github.com/golang/protobuf v1.5.3
github.com/google/go-cmp v0.5.9
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think, previously I removed go-cmp package to use reflect everywhere for our unit tests.
WDYT @tenzen-y ?

Copy link
Member Author

@tenzen-y tenzen-y Jul 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to use go-cmp since comparing the expected struct and the actual one (eyeball grep) is too harder...
Actually, I had a hard time fixing tests in this PR.
WDYT? @andreyvelich

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, I think, if that is easier to use this lib instead of reflect , we might to want re-use it everywhere across our unit tests in the future.
Maybe make it as best practice to write unit test in our Developer Guide.

What are your thoughts @tenzen-y @johnugeorge ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SGTM

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to work on updating the developer guide.

github.com/google/go-containerregistry v0.9.0
github.com/google/go-containerregistry/pkg/authn/k8schain v0.0.0-20211222182933-7c19fa370dbd
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
Expand All @@ -31,6 +32,7 @@ require (
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f
sigs.k8s.io/controller-runtime v0.15.0
sigs.k8s.io/structured-merge-diff/v4 v4.2.3
sigs.k8s.io/yaml v1.3.0
)

require (
Expand Down Expand Up @@ -65,7 +67,6 @@ require (
github.com/golang-jwt/jwt/v4 v4.4.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/google/gnostic v0.5.7-v3refs // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
Expand Down Expand Up @@ -135,5 +136,4 @@ require (
k8s.io/legacy-cloud-providers v0.21.0 // indirect
k8s.io/utils v0.0.0-20230209194617-a36077c30491 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)
7 changes: 6 additions & 1 deletion hack/update-codegen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ if [[ -z "${GOPATH:-}" ]]; then
fi

# Grab code-generator version from go.mod
CODEGEN_VERSION=$(cd ../../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}')
CODEGEN_VERSION=$(cd ../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}')
CODEGEN_PKG="$GOPATH/pkg/mod/k8s.io/code-generator@${CODEGEN_VERSION}"

if [[ ! -d "${CODEGEN_PKG}" ]]; then
Expand All @@ -53,3 +53,8 @@ echo "Generating clients for ${GROUP_VERSIONS} ..."
github.com/kubeflow/katib/pkg/apis/controller \
"${GROUP_VERSIONS}" \
--go-header-file "${PROJECT_ROOT}/hack/boilerplate/boilerplate.go.txt"

echo "Generating deepcopy for config.kubeflow.org ..."
"${PROJECT_ROOT}/bin/controller-gen" \
object:headerFile="${PROJECT_ROOT}/hack/boilerplate/boilerplate.go.txt" \
paths="${PROJECT_ROOT}/pkg/apis/config/..."
2 changes: 1 addition & 1 deletion hack/update-openapigen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ if [[ -z "${GOPATH:-}" ]]; then
fi

# Grab code-generator version from go.mod
CODEGEN_VERSION=$(cd ../../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}')
CODEGEN_VERSION=$(cd ../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}')
CODEGEN_PKG="${GOPATH}/pkg/mod/k8s.io/code-generator@${CODEGEN_VERSION}"

if [[ ! -d ${CODEGEN_PKG} ]]; then
Expand Down
15 changes: 8 additions & 7 deletions manifests/v1beta1/components/controller/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,7 @@ spec:
image: docker.io/kubeflowkatib/katib-controller
andreyvelich marked this conversation as resolved.
Show resolved Hide resolved
command: ["./katib-controller"]
args:
- "--webhook-port=8443"
- "--trial-resources=Job.v1.batch"
- "--trial-resources=TFJob.v1.kubeflow.org"
- "--trial-resources=PyTorchJob.v1.kubeflow.org"
- "--trial-resources=MPIJob.v1.kubeflow.org"
- "--trial-resources=XGBoostJob.v1.kubeflow.org"
- "--trial-resources=MXJob.v1.kubeflow.org"
- --katib-config=/katib-config.yaml
ports:
- containerPort: 8443
name: webhook
Expand Down Expand Up @@ -60,8 +54,15 @@ spec:
- mountPath: /tmp/cert
name: cert
readOnly: true
- mountPath: /katib-config.yaml
name: katib-config
subPath: katib-config.yaml
readOnly: true
volumes:
- name: cert
secret:
defaultMode: 420
secretName: katib-webhook-cert
- name: katib-config
configMap:
name: katib-config
81 changes: 0 additions & 81 deletions manifests/v1beta1/components/controller/katib-config.yaml

This file was deleted.

Loading