Skip to content

Commit

Permalink
Merge pull request #119137 from kmala/1.25
Browse files Browse the repository at this point in the history
Update schedule logic to properly calculate missed schedules
  • Loading branch information
k8s-ci-robot committed Jul 7, 2023
2 parents 3785d30 + e6b9bd0 commit 7af93f9
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 46 deletions.
83 changes: 53 additions & 30 deletions pkg/controller/cronjob/utils.go
Expand Up @@ -90,56 +90,79 @@ func getNextScheduleTime(cj batchv1.CronJob, now time.Time, schedule cron.Schedu
return nil, nil
}

t, numberOfMissedSchedules, err := getMostRecentScheduleTime(earliestTime, now, schedule)

if numberOfMissedSchedules > 100 {
// An object might miss several starts. For example, if
// controller gets wedged on friday at 5:01pm when everyone has
// gone home, and someone comes in on tuesday AM and discovers
// the problem and restarts the controller, then all the hourly
// jobs, more than 80 of them for one hourly cronJob, should
// all start running with no further intervention (if the cronJob
// allows concurrency and late starts).
//
// However, if there is a bug somewhere, or incorrect clock
// on controller's server or apiservers (for setting creationTimestamp)
// then there could be so many missed start times (it could be off
// by decades or more), that it would eat up all the CPU and memory
// of this controller. In that case, we want to not try to list
// all the missed start times.
//
// I've somewhat arbitrarily picked 100, as more than 80,
// but less than "lots".
recorder.Eventf(&cj, corev1.EventTypeWarning, "TooManyMissedTimes", "too many missed start times: %d. Set or decrease .spec.startingDeadlineSeconds or check clock skew", numberOfMissedSchedules)
klog.InfoS("too many missed times", "cronjob", klog.KRef(cj.GetNamespace(), cj.GetName()), "missed times", numberOfMissedSchedules)
t, tooManyMissed, err := getMostRecentScheduleTime(earliestTime, now, schedule)

if tooManyMissed {
recorder.Eventf(&cj, corev1.EventTypeWarning, "TooManyMissedTimes", "too many missed start times. Set or decrease .spec.startingDeadlineSeconds or check clock skew")
klog.InfoS("too many missed times", "cronjob", klog.KRef(cj.GetNamespace(), cj.GetName()))
}
return t, err
}

// getMostRecentScheduleTime returns the latest schedule time between earliestTime and the count of number of
// schedules in between them
func getMostRecentScheduleTime(earliestTime time.Time, now time.Time, schedule cron.Schedule) (*time.Time, int64, error) {
// getMostRecentScheduleTime returns the latest schedule time between earliestTime and
// boolean whether an excessive number of schedules are missed
func getMostRecentScheduleTime(earliestTime time.Time, now time.Time, schedule cron.Schedule) (*time.Time, bool, error) {
t1 := schedule.Next(earliestTime)
t2 := schedule.Next(t1)

if now.Before(t1) {
return nil, 0, nil
return nil, false, nil
}
if now.Before(t2) {
return &t1, 1, nil
return &t1, false, nil
}

// It is possible for cron.ParseStandard("59 23 31 2 *") to return an invalid schedule
// seconds - 59, minute - 23, hour - 31 (?!) dom - 2, and dow is optional, clearly 31 is invalid
// In this case the timeBetweenTwoSchedules will be 0, and we error out the invalid schedule
timeBetweenTwoSchedules := int64(t2.Sub(t1).Round(time.Second).Seconds())
if timeBetweenTwoSchedules < 1 {
return nil, 0, fmt.Errorf("time difference between two schedules less than 1 second")
return nil, false, fmt.Errorf("time difference between two schedules less than 1 second")
}
// this logic used for calculating number of missed schedules does a rough
// approximation, by calculating a diff between two schedules (t1 and t2),
// and counting how many of these will fit in between last schedule and now
timeElapsed := int64(now.Sub(t1).Seconds())
numberOfMissedSchedules := (timeElapsed / timeBetweenTwoSchedules) + 1
t := time.Unix(t1.Unix()+((numberOfMissedSchedules-1)*timeBetweenTwoSchedules), 0).UTC()
return &t, numberOfMissedSchedules, nil

var mostRecentTime time.Time
// to get the most recent time accurate for regular schedules and the ones
// specified with @every form, we first need to calculate the potential earliest
// time by multiplying the initial number of missed schedules by its interval,
// this is critical to ensure @every starts at the correct time, this explains
// the numberOfMissedSchedules-1, the additional -1 serves there to go back
// in time one more time unit, and let the cron library calculate a proper
// schedule, for case where the schedule is not consistent, for example
// something like 30 6-16/4 * * 1-5
potentialEarliest := t1.Add(time.Duration((numberOfMissedSchedules-1-1)*timeBetweenTwoSchedules) * time.Second)
for t := schedule.Next(potentialEarliest); !t.After(now); t = schedule.Next(t) {
mostRecentTime = t
}

// An object might miss several starts. For example, if
// controller gets wedged on friday at 5:01pm when everyone has
// gone home, and someone comes in on tuesday AM and discovers
// the problem and restarts the controller, then all the hourly
// jobs, more than 80 of them for one hourly cronJob, should
// all start running with no further intervention (if the cronJob
// allows concurrency and late starts).
//
// However, if there is a bug somewhere, or incorrect clock
// on controller's server or apiservers (for setting creationTimestamp)
// then there could be so many missed start times (it could be off
// by decades or more), that it would eat up all the CPU and memory
// of this controller. In that case, we want to not try to list
// all the missed start times.
//
// I've somewhat arbitrarily picked 100, as more than 80,
// but less than "lots".
tooManyMissed := numberOfMissedSchedules > 100

if mostRecentTime.IsZero() {
return nil, tooManyMissed, nil
}

return &mostRecentTime, tooManyMissed, nil
}

func copyLabels(template *batchv1.JobTemplateSpec) labels.Set {
Expand Down
75 changes: 59 additions & 16 deletions pkg/controller/cronjob/utils_test.go
Expand Up @@ -25,7 +25,7 @@ import (

cron "github.com/robfig/cron/v3"
batchv1 "k8s.io/api/batch/v1"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
Expand Down Expand Up @@ -284,11 +284,11 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
schedule string
}
tests := []struct {
name string
args args
expectedTime *time.Time
expectedNumberOfMisses int64
wantErr bool
name string
args args
expectedTime *time.Time
expectedTooManyMissed bool
wantErr bool
}{
{
name: "now before next schedule",
Expand All @@ -306,8 +306,7 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
now: topOfTheHour().Add(time.Minute * 61),
schedule: "0 * * * *",
},
expectedTime: deltaTimeAfterTopOfTheHour(time.Minute * 60),
expectedNumberOfMisses: 1,
expectedTime: deltaTimeAfterTopOfTheHour(time.Minute * 60),
},
{
name: "missed 5 schedules",
Expand All @@ -316,8 +315,53 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
now: *deltaTimeAfterTopOfTheHour(time.Minute * 301),
schedule: "0 * * * *",
},
expectedTime: deltaTimeAfterTopOfTheHour(time.Minute * 300),
expectedNumberOfMisses: 5,
expectedTime: deltaTimeAfterTopOfTheHour(time.Minute * 300),
},
{
name: "complex schedule",
args: args{
earliestTime: deltaTimeAfterTopOfTheHour(30 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(24*time.Hour + 31*time.Minute),
schedule: "30 6-16/4 * * 1-5",
},
expectedTime: deltaTimeAfterTopOfTheHour(24*time.Hour + 30*time.Minute),
},
{
name: "another complex schedule",
args: args{
earliestTime: deltaTimeAfterTopOfTheHour(30 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(30*time.Hour + 30*time.Minute),
schedule: "30 10,11,12 * * 1-5",
},
expectedTime: nil,
},
{
name: "complex schedule with longer diff between executions",
args: args{
earliestTime: deltaTimeAfterTopOfTheHour(30 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(96*time.Hour + 31*time.Minute),
schedule: "30 6-16/4 * * 1-5",
},
expectedTime: deltaTimeAfterTopOfTheHour(96*time.Hour + 30*time.Minute),
},
{
name: "complex schedule with shorter diff between executions",
args: args{
earliestTime: topOfTheHour(),
now: *deltaTimeAfterTopOfTheHour(24*time.Hour + 31*time.Minute),
schedule: "30 6-16/4 * * 1-5",
},
expectedTime: deltaTimeAfterTopOfTheHour(24*time.Hour + 30*time.Minute),
},
{
name: "@every schedule",
args: args{
earliestTime: deltaTimeAfterTopOfTheHour(1 * time.Minute),
now: *deltaTimeAfterTopOfTheHour(7 * 24 * time.Hour),
schedule: "@every 1h",
},
expectedTime: deltaTimeAfterTopOfTheHour((6 * 24 * time.Hour) + 23*time.Hour + 1*time.Minute),
expectedTooManyMissed: true,
},
{
name: "rogue cronjob",
Expand All @@ -326,9 +370,8 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
now: *deltaTimeAfterTopOfTheHour(time.Hour * 1000000),
schedule: "59 23 31 2 *",
},
expectedTime: nil,
expectedNumberOfMisses: 0,
wantErr: true,
expectedTime: nil,
wantErr: true,
},
}
for _, tt := range tests {
Expand All @@ -337,7 +380,7 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
if err != nil {
t.Errorf("error setting up the test, %s", err)
}
gotTime, gotNumberOfMisses, err := getMostRecentScheduleTime(*tt.args.earliestTime, tt.args.now, sched)
gotTime, gotTooManyMissed, err := getMostRecentScheduleTime(*tt.args.earliestTime, tt.args.now, sched)
if tt.wantErr {
if err == nil {
t.Error("getMostRecentScheduleTime() got no error when expected one")
Expand All @@ -353,8 +396,8 @@ func TestGetMostRecentScheduleTime(t *testing.T) {
if gotTime != nil && tt.expectedTime != nil && !gotTime.Equal(*tt.expectedTime) {
t.Errorf("getMostRecentScheduleTime() got = %v, want %v", gotTime, tt.expectedTime)
}
if gotNumberOfMisses != tt.expectedNumberOfMisses {
t.Errorf("getMostRecentScheduleTime() got1 = %v, want %v", gotNumberOfMisses, tt.expectedNumberOfMisses)
if gotTooManyMissed != tt.expectedTooManyMissed {
t.Errorf("getMostRecentScheduleTime() got1 = %v, want %v", gotTooManyMissed, tt.expectedTooManyMissed)
}
})
}
Expand Down

0 comments on commit 7af93f9

Please sign in to comment.