Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update schedule logic to properly calculate missed schedules #119137

Merged
merged 2 commits into from Jul 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
83 changes: 53 additions & 30 deletions pkg/controller/cronjob/utils.go
Expand Up @@ -90,56 +90,79 @@ func getNextScheduleTime(cj batchv1.CronJob, now time.Time, schedule cron.Schedu
return nil, nil
}

t, numberOfMissedSchedules, err := getMostRecentScheduleTime(earliestTime, now, schedule)

if numberOfMissedSchedules > 100 {
// An object might miss several starts. For example, if
// controller gets wedged on friday at 5:01pm when everyone has
// gone home, and someone comes in on tuesday AM and discovers
// the problem and restarts the controller, then all the hourly
// jobs, more than 80 of them for one hourly cronJob, should
// all start running with no further intervention (if the cronJob
// allows concurrency and late starts).
//
// However, if there is a bug somewhere, or incorrect clock
// on controller's server or apiservers (for setting creationTimestamp)
// then there could be so many missed start times (it could be off
// by decades or more), that it would eat up all the CPU and memory
// of this controller. In that case, we want to not try to list
// all the missed start times.
//
// I've somewhat arbitrarily picked 100, as more than 80,
// but less than "lots".
recorder.Eventf(&cj, corev1.EventTypeWarning, "TooManyMissedTimes", "too many missed start times: %d. Set or decrease .spec.startingDeadlineSeconds or check clock skew", numberOfMissedSchedules)
klog.InfoS("too many missed times", "cronjob", klog.KRef(cj.GetNamespace(), cj.GetName()), "missed times", numberOfMissedSchedules)
t, tooManyMissed, err := getMostRecentScheduleTime(earliestTime, now, schedule)

if tooManyMissed {
recorder.Eventf(&cj, corev1.EventTypeWarning, "TooManyMissedTimes", "too many missed start times. Set or decrease .spec.startingDeadlineSeconds or check clock skew")
klog.InfoS("too many missed times", "cronjob", klog.KRef(cj.GetNamespace(), cj.GetName()))
}
return t, err
}

// getMostRecentScheduleTime returns the latest schedule time between earliestTime and the count of number of
// schedules in between them
func getMostRecentScheduleTime(earliestTime time.Time, now time.Time, schedule cron.Schedule) (*time.Time, int64, error) {
// getMostRecentScheduleTime returns the latest schedule time between earliestTime and
// boolean whether an excessive number of schedules are missed
func getMostRecentScheduleTime(earliestTime time.Time, now time.Time, schedule cron.Schedule) (*time.Time, bool, error) {
t1 := schedule.Next(earliestTime)
t2 := schedule.Next(t1)

if now.Before(t1) {
return nil, 0, nil
return nil, false, nil
}
if now.Before(t2) {
return &t1, 1, nil
return &t1, false, nil
}

// It is possible for cron.ParseStandard("59 23 31 2 *") to return an invalid schedule
// seconds - 59, minute - 23, hour - 31 (?!) dom - 2, and dow is optional, clearly 31 is invalid
// In this case the timeBetweenTwoSchedules will be 0, and we error out the invalid schedule
timeBetweenTwoSchedules := int64(t2.Sub(t1).Round(time.Second).Seconds())
if timeBetweenTwoSchedules < 1 {
return nil, 0, fmt.Errorf("time difference between two schedules less than 1 second")
return nil, false, fmt.Errorf("time difference between two schedules less than 1 second")
}
// this logic used for calculating number of missed schedules does a rough
// approximation, by calculating a diff between two schedules (t1 and t2),
// and counting how many of these will fit in between last schedule and now
timeElapsed := int64(now.Sub(t1).Seconds())
numberOfMissedSchedules := (timeElapsed / timeBetweenTwoSchedules) + 1
t := time.Unix(t1.Unix()+((numberOfMissedSchedules-1)*timeBetweenTwoSchedules), 0).UTC()
return &t, numberOfMissedSchedules, nil

var mostRecentTime time.Time
// to get the most recent time accurate for regular schedules and the ones
// specified with @every form, we first need to calculate the potential earliest
// time by multiplying the initial number of missed schedules by its interval,
// this is critical to ensure @every starts at the correct time, this explains
// the numberOfMissedSchedules-1, the additional -1 serves there to go back
// in time one more time unit, and let the cron library calculate a proper
// schedule, for case where the schedule is not consistent, for example
// something like 30 6-16/4 * * 1-5
potentialEarliest := t1.Add(time.Duration((numberOfMissedSchedules-1-1)*timeBetweenTwoSchedules) * time.Second)
for t := schedule.Next(potentialEarliest); !t.After(now); t = schedule.Next(t) {
mostRecentTime = t
}

// An object might miss several starts. For example, if
// controller gets wedged on friday at 5:01pm when everyone has
// gone home, and someone comes in on tuesday AM and discovers
// the problem and restarts the controller, then all the hourly
// jobs, more than 80 of them for one hourly cronJob, should
// all start running with no further intervention (if the cronJob
// allows concurrency and late starts).
//
// However, if there is a bug somewhere, or incorrect clock
// on controller's server or apiservers (for setting creationTimestamp)
// then there could be so many missed start times (it could be off
// by decades or more), that it would eat up all the CPU and memory
// of this controller. In that case, we want to not try to list
// all the missed start times.
//
// I've somewhat arbitrarily picked 100, as more than 80,
// but less than "lots".
tooManyMissed := numberOfMissedSchedules > 100

if mostRecentTime.IsZero() {
return nil, tooManyMissed, nil
}

return &mostRecentTime, tooManyMissed, nil
}

func copyLabels(template *batchv1.JobTemplateSpec) labels.Set {
Expand Down
75 changes: 59 additions & 16 deletions pkg/controller/cronjob/utils_test.go
Expand Up @@ -25,7 +25,7 @@ import (

cron "github.com/robfig/cron/v3"
batchv1 "k8s.io/api/batch/v1"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
Expand Down Expand Up @@ -284,11 +284,11 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
schedule string
}
tests := []struct {
name string
args args
expectedTime *time.Time
expectedNumberOfMisses int64
wantErr bool
name string
args args
expectedTime *time.Time
expectedTooManyMissed bool
wantErr bool
}{
{
name: "now before next schedule",
Expand All @@ -306,8 +306,7 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
now: topOfTheHour().Add(time.Minute * 61),
schedule: "0 * * * *",
},
expectedTime: deltaTimeAfterTopOfTheHour(time.Minute * 60),
expectedNumberOfMisses: 1,
expectedTime: deltaTimeAfterTopOfTheHour(time.Minute * 60),
},
{
name: "missed 5 schedules",
Expand All @@ -316,8 +315,53 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
now: *deltaTimeAfterTopOfTheHour(time.Minute * 301),
schedule: "0 * * * *",
},
expectedTime: deltaTimeAfterTopOfTheHour(time.Minute * 300),
expectedNumberOfMisses: 5,
expectedTime: deltaTimeAfterTopOfTheHour(time.Minute * 300),
},
{
name: "complex schedule",
args: args{
earliestTime: deltaTimeAfterTopOfTheHour(30 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(24*time.Hour + 31*time.Minute),
schedule: "30 6-16/4 * * 1-5",
},
expectedTime: deltaTimeAfterTopOfTheHour(24*time.Hour + 30*time.Minute),
},
{
name: "another complex schedule",
args: args{
earliestTime: deltaTimeAfterTopOfTheHour(30 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(30*time.Hour + 30*time.Minute),
schedule: "30 10,11,12 * * 1-5",
},
expectedTime: nil,
},
{
name: "complex schedule with longer diff between executions",
args: args{
earliestTime: deltaTimeAfterTopOfTheHour(30 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(96*time.Hour + 31*time.Minute),
schedule: "30 6-16/4 * * 1-5",
},
expectedTime: deltaTimeAfterTopOfTheHour(96*time.Hour + 30*time.Minute),
},
{
name: "complex schedule with shorter diff between executions",
args: args{
earliestTime: topOfTheHour(),
now: *deltaTimeAfterTopOfTheHour(24*time.Hour + 31*time.Minute),
schedule: "30 6-16/4 * * 1-5",
},
expectedTime: deltaTimeAfterTopOfTheHour(24*time.Hour + 30*time.Minute),
},
{
name: "@every schedule",
args: args{
earliestTime: deltaTimeAfterTopOfTheHour(1 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(7 * 24 * time.Hour),
schedule: "@every 1h",
},
expectedTime: deltaTimeAfterTopOfTheHour((6 * 24 * time.Hour) + 23*time.Hour + 1*time.Minute),
expectedTooManyMissed: true,
},
{
name: "rogue cronjob",
Expand All @@ -326,9 +370,8 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
now: *deltaTimeAfterTopOfTheHour(time.Hour * 1000000),
schedule: "59 23 31 2 *",
},
expectedTime: nil,
expectedNumberOfMisses: 0,
wantErr: true,
expectedTime: nil,
wantErr: true,
},
}
for _, tt := range tests {
Expand All @@ -337,7 +380,7 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
if err != nil {
t.Errorf("error setting up the test, %s", err)
}
gotTime, gotNumberOfMisses, err := getMostRecentScheduleTime(*tt.args.earliestTime, tt.args.now, sched)
gotTime, gotTooManyMissed, err := getMostRecentScheduleTime(*tt.args.earliestTime, tt.args.now, sched)
if tt.wantErr {
if err == nil {
t.Error("getMostRecentScheduleTime() got no error when expected one")
Expand All @@ -353,8 +396,8 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
if gotTime != nil && tt.expectedTime != nil && !gotTime.Equal(*tt.expectedTime) {
t.Errorf("getMostRecentScheduleTime() got = %v, want %v", gotTime, tt.expectedTime)
}
if gotNumberOfMisses != tt.expectedNumberOfMisses {
t.Errorf("getMostRecentScheduleTime() got1 = %v, want %v", gotNumberOfMisses, tt.expectedNumberOfMisses)
if gotTooManyMissed != tt.expectedTooManyMissed {
t.Errorf("getMostRecentScheduleTime() got1 = %v, want %v", gotTooManyMissed, tt.expectedTooManyMissed)
}
})
}
Expand Down