Skip to content

Commit

Permalink
Merge pull request #421 from danielvegamyhre/val-err-msg
Browse files Browse the repository at this point in the history
Improve error messages and logging in webhooks
  • Loading branch information
k8s-ci-robot committed Feb 13, 2024
2 parents fb8aa28 + 15c24a3 commit 5f2220c
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 6 deletions.
28 changes: 28 additions & 0 deletions api/jobset/v1alpha2/jobset_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"fmt"
"math"
"strconv"
"strings"

apivalidation "k8s.io/apimachinery/pkg/api/validation"
"k8s.io/apimachinery/pkg/runtime"
Expand All @@ -36,6 +37,24 @@ import (
corev1 "k8s.io/api/core/v1"
)

const (
	// dns1035MaxLengthExceededErrorMsg is the error message returned by
	// IsDNS1035Label when the given input is longer than 63 characters.
	// Validation matches it as a substring to detect length violations and
	// substitute one of the more specific messages below.
	dns1035MaxLengthExceededErrorMsg = "must be no more than 63 characters"

	// jobNameTooLongErrorMsg is the error message returned by JobSet
	// validation if the names generated for child jobs will be longer than
	// 63 characters.
	jobNameTooLongErrorMsg = "JobSet name is too long, job names generated for this JobSet will exceed 63 characters"

	// podNameTooLongErrorMsg is the error message returned by JobSet
	// validation if the generated pod names will be longer than 63 characters.
	podNameTooLongErrorMsg = "JobSet name is too long, pod names generated for this JobSet will exceed 63 characters"

	// subdomainTooLongErrMsg is the error message returned by JobSet
	// validation if the network subdomain is longer than 63 characters.
	// A subdomain of exactly 63 characters is valid, so the message says
	// "no more than" rather than "less than".
	subdomainTooLongErrMsg = ".spec.network.subdomain is too long, must be no more than 63 characters"
)

func (js *JobSet) SetupWebhookWithManager(mgr ctrl.Manager) error {
return ctrl.NewWebhookManagedBy(mgr).
For(js).
Expand Down Expand Up @@ -102,6 +121,9 @@ func (js *JobSet) ValidateCreate() (admission.Warnings, error) {

// Since subdomain name is also used as service name, it must adhere to RFC 1035 as well.
for _, errMessage := range validation.IsDNS1035Label(js.Spec.Network.Subdomain) {
if strings.Contains(errMessage, dns1035MaxLengthExceededErrorMsg) {
errMessage = subdomainTooLongErrMsg
}
allErrs = append(allErrs, fmt.Errorf(errMessage))
}
}
Expand All @@ -118,6 +140,9 @@ func (js *JobSet) ValidateCreate() (admission.Warnings, error) {
// Use the largest job index as it will have the longest name.
longestJobName := placement.GenJobName(js.Name, rjob.Name, int(rjob.Replicas-1))
for _, errMessage := range validation.IsDNS1035Label(longestJobName) {
if strings.Contains(errMessage, dns1035MaxLengthExceededErrorMsg) {
errMessage = jobNameTooLongErrorMsg
}
allErrs = append(allErrs, fmt.Errorf(errMessage))
}
// Check that the generated pod names for the replicated job is DNS 1035 compliant.
Expand All @@ -128,6 +153,9 @@ func (js *JobSet) ValidateCreate() (admission.Warnings, error) {
// Add 5 char suffix to the deterministic part of the pod name to validate the full pod name is compliant.
longestPodName := placement.GenPodName(js.Name, rjob.Name, maxJobIndex, maxPodIndex) + "-abcde"
for _, errMessage := range validation.IsDNS1035Label(longestPodName) {
if strings.Contains(errMessage, dns1035MaxLengthExceededErrorMsg) {
errMessage = podNameTooLongErrorMsg
}
allErrs = append(allErrs, fmt.Errorf(errMessage))
}
}
Expand Down
6 changes: 3 additions & 3 deletions api/jobset/v1alpha2/jobset_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ func TestValidateCreate(t *testing.T) {
},
},
want: errors.Join(
fmt.Errorf("must be no more than 63 characters"),
fmt.Errorf(subdomainTooLongErrMsg),
),
},
{
Expand Down Expand Up @@ -708,7 +708,7 @@ func TestValidateCreate(t *testing.T) {
},
},
want: errors.Join(
fmt.Errorf("must be no more than 63 characters"),
fmt.Errorf(jobNameTooLongErrorMsg),
),
},
{
Expand All @@ -735,7 +735,7 @@ func TestValidateCreate(t *testing.T) {
},
},
want: errors.Join(
fmt.Errorf("must be no more than 63 characters"),
fmt.Errorf(podNameTooLongErrorMsg),
),
},
}
Expand Down
11 changes: 8 additions & 3 deletions pkg/webhooks/pod_admission_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func (p *podWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (ad
return nil, err
}
if !leaderScheduled {
return nil, fmt.Errorf("leader pod not yet scheduled, not creating follower pod %q", pod.Name)
return nil, fmt.Errorf("leader pod not yet scheduled, not creating follower pod. this is an expected, transient error.")
}
return nil, nil
}
Expand All @@ -75,11 +75,16 @@ func (p *podWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (ad
}

// leaderPodScheduled reports whether the leader pod for the given follower pod
// has been assigned to a node, i.e. whether its Spec.NodeName is non-empty.
// It returns an error if the leader pod lookup fails.
func (p *podWebhook) leaderPodScheduled(ctx context.Context, pod *corev1.Pod) (bool, error) {
	log := ctrl.LoggerFrom(ctx)
	leaderPod, err := p.leaderPodForFollower(ctx, pod)
	if err != nil {
		return false, err
	}
	scheduled := leaderPod.Spec.NodeName != ""
	if !scheduled {
		// The logger's Info method takes a constant message followed by
		// key/value pairs; it does not interpolate printf-style verbs, so the
		// pod name is passed as a structured field instead.
		log.V(3).Info("leader pod is not yet scheduled", "leaderPod", leaderPod.Name)
	}
	return scheduled, nil
}

func (p *podWebhook) leaderPodForFollower(ctx context.Context, pod *corev1.Pod) (*corev1.Pod, error) {
Expand All @@ -99,7 +104,7 @@ func (p *podWebhook) leaderPodForFollower(ctx context.Context, pod *corev1.Pod)

// Validate there is only 1 leader pod for this job.
if len(podList.Items) != 1 {
return nil, fmt.Errorf("incorrect number of leader pods for this job (expected 1, got %d)", len(podList.Items))
return nil, fmt.Errorf("expected 1 leader pod (%s), but got %d. this is an expected, transient error.", leaderPodName, len(podList.Items))
}

// Check if the leader pod is scheduled.
Expand Down

0 comments on commit 5f2220c

Please sign in to comment.