Skip to content

Commit

Permalink
add support for ttl after finished controller for cleaning up finishe…
Browse files Browse the repository at this point in the history
…d jobsets
  • Loading branch information
dejanzele committed Dec 30, 2023
1 parent c1b5a4b commit d714d55
Show file tree
Hide file tree
Showing 20 changed files with 867 additions and 16 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ RUN go mod download
# Copy the go source
COPY main.go main.go
COPY api/ api/
COPY client-go/ client-go/
COPY pkg/controllers/ pkg/controllers/
COPY pkg/util/ pkg/util/
COPY pkg/webhooks pkg/webhooks
Expand Down
12 changes: 12 additions & 0 deletions api/jobset/v1alpha2/jobset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,18 @@ type JobSetSpec struct {

// Suspend suspends all running child Jobs when set to true.
Suspend *bool `json:"suspend,omitempty"`

// TTLSecondsAfterFinished limits the lifetime of a JobSet that has finished
// execution (either Complete or Failed). If this field is set,
// TTLSecondsAfterFinished after the JobSet finishes, it is eligible to be
// automatically deleted. When the JobSet is being deleted, its lifecycle
// guarantees (e.g. finalizers) will be honored. If this field is unset,
// the JobSet won't be automatically deleted. If this field is set to zero,
// the JobSet becomes eligible to be deleted immediately after it finishes.
// +kubebuilder:validation:Minimum=0
// +kubebuilder:default=0
// +optional
TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
}

// JobSetStatus defines the observed state of JobSet
Expand Down
7 changes: 7 additions & 0 deletions api/jobset/v1alpha2/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions api/jobset/v1alpha2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 14 additions & 5 deletions client-go/applyconfiguration/jobset/v1alpha2/jobsetspec.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions config/components/crd/bases/jobset.x-k8s.io_jobsets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9711,6 +9711,19 @@ spec:
suspend:
description: Suspend suspends all running child Jobs when set to true.
type: boolean
ttlSecondsAfterFinished:
default: 0
description: TTLSecondsAfterFinished limits the lifetime of a JobSet
that has finished execution (either Complete or Failed). If this
field is set, TTLSecondsAfterFinished after the JobSet finishes,
it is eligible to be automatically deleted. When the JobSet is being
deleted, its lifecycle guarantees (e.g. finalizers) will be honored.
If this field is unset, the JobSet won't be automatically deleted.
If this field is set to zero, the JobSet becomes eligible to be
deleted immediately after it finishes.
format: int32
minimum: 0
type: integer
type: object
status:
description: JobSetStatus defines the observed state of JobSet
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/onsi/ginkgo/v2 v2.13.2
github.com/onsi/gomega v1.30.0
github.com/open-policy-agent/cert-controller v0.10.1
github.com/prometheus/client_golang v1.16.0
github.com/stretchr/testify v1.8.4
k8s.io/api v0.28.5
k8s.io/apimachinery v0.28.5
Expand Down Expand Up @@ -51,7 +52,6 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.16.0 // indirect
github.com/prometheus/client_model v0.4.0 // indirect
github.com/prometheus/common v0.44.0 // indirect
github.com/prometheus/procfs v0.10.1 // indirect
Expand Down
5 changes: 5 additions & 0 deletions hack/python-sdk/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,11 @@
"suspend": {
"description": "Suspend suspends all running child Jobs when set to true.",
"type": "boolean"
},
"ttlSecondsAfterFinished": {
"description": "TTLSecondsAfterFinished limits the lifetime of a JobSet that has finished execution (either Complete or Failed). If this field is set, TTLSecondsAfterFinished after the JobSet finishes, it is eligible to be automatically deleted. When the JobSet is being deleted, its lifecycle guarantees (e.g. finalizers) will be honored. If this field is unset, the JobSet won't be automatically deleted. If this field is set to zero, the JobSet becomes eligible to be deleted immediately after it finishes.",
"type": "integer",
"format": "int32"
}
}
},
Expand Down
28 changes: 25 additions & 3 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,15 @@ limitations under the License.
package main

import (
"context"
"flag"
"os"
"time"

"sigs.k8s.io/jobset/pkg/util/cert"

"sigs.k8s.io/jobset/client-go/clientset/versioned"
"sigs.k8s.io/jobset/client-go/informers/externalversions"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
Expand All @@ -35,7 +42,6 @@ import (

jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
"sigs.k8s.io/jobset/pkg/controllers"
"sigs.k8s.io/jobset/pkg/util/cert"
"sigs.k8s.io/jobset/pkg/webhooks"
//+kubebuilder:scaffold:imports
)
Expand Down Expand Up @@ -125,7 +131,7 @@ func main() {
// Cert won't be ready until manager starts, so start a goroutine here which
// will block until the cert is ready before setting up the controllers.
// Controllers who register after manager starts will start directly.
go setupControllers(mgr, certsReady)
go setupControllers(ctx, mgr, certsReady)

setupHealthzAndReadyzCheck(mgr)

Expand All @@ -136,7 +142,7 @@ func main() {
}
}

func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
func setupControllers(ctx context.Context, mgr ctrl.Manager, certsReady chan struct{}) {
// The controllers won't work until the webhooks are operating,
// and the webhook won't work until the certs are all in places.
setupLog.Info("waiting for the cert generation to complete")
Expand All @@ -157,6 +163,22 @@ func setupControllers(mgr ctrl.Manager, certsReady chan struct{}) {
os.Exit(1)
}

clientset := versioned.NewForConfigOrDie(mgr.GetConfig())
sharedInformers := externalversions.NewSharedInformerFactory(clientset, 30*time.Minute)
jobSetInformer := sharedInformers.Jobset().V1alpha2().JobSets()
ttlAfterFinishedController := controllers.NewTTLAfterFinishedReconciler(
mgr.GetClient(),
mgr.GetScheme(),
jobSetInformer,
ctrl.Log.WithValues("controller", "TTLAfterFinished"),
)
if err := ttlAfterFinishedController.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "TTLAfterFinished")
os.Exit(1)
}

go ttlAfterFinishedController.Run(ctx, 1)

// Set up JobSet validating/defaulting webhook.
if err := (&jobset.JobSet{}).SetupWebhookWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create webhook", "webhook", "JobSet")
Expand Down
28 changes: 28 additions & 0 deletions pkg/controllers/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// TTLAfterFinishedSubsystem - subsystem name used for this controller.
const TTLAfterFinishedSubsystem = "ttl_after_finished_controller"

var (
// JobSetDeletionDurationSeconds tracks the time it took to delete the jobset since it
// became eligible for deletion.
JobSetDeletionDurationSeconds = prometheus.NewHistogram(
prometheus.HistogramOpts{
Subsystem: TTLAfterFinishedSubsystem,
Name: "jobset_deletion_duration_seconds",
Help: "The time it took to delete the jobset since it became eligible for deletion",
// Start with 100ms with the last bucket being [~27m, Inf).
Buckets: prometheus.ExponentialBuckets(0.1, 2, 14),
},
)
)

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(JobSetDeletionDurationSeconds)
}
Loading

0 comments on commit d714d55

Please sign in to comment.