Job lifecycle latency measurement added

kubernetes · Nov 14, 2022 · 0981cd2 · 0981cd2
1 parent 34b4a7a
commit 0981cd2
Show file tree

Hide file tree

Showing 3 changed files with 252 additions and 1 deletion.
diff --git a/clusterloader2/pkg/measurement/common/job_lifecycle_latency.go b/clusterloader2/pkg/measurement/common/job_lifecycle_latency.go
@@ -0,0 +1,236 @@
+/*
+Copyright 2022 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package common
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	batchv1 "k8s.io/api/batch/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/apimachinery/pkg/watch"
+	clientset "k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/tools/cache"
+	"k8s.io/client-go/util/workqueue"
+	"k8s.io/klog/v2"
+	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
+	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
+	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer"
+	"k8s.io/perf-tests/clusterloader2/pkg/util"
+)
+
+type eventData struct {
+	obj      interface{}
+	recvTime time.Time
+}
+
+const (
+	jobLifecycleLatencyMeasurementName = "JobLifecycleLatency"
+	checkCompletedJobsInterval         = time.Second
+	jobCreated                         = "JobCreated"
+	jobStarted                         = "JobStarted"
+	jobCompleted                       = "JobCompleted"
+)
+
+func init() {
+	if err := measurement.Register(jobLifecycleLatencyMeasurementName, createJobLifecycleLatencyMeasurement); err != nil {
+		klog.Fatalf("Can't register service %v", err)
+	}
+}
+
+func createJobLifecycleLatencyMeasurement() measurement.Measurement {
+	return &jobLifecycleLatencyMeasurement{
+		selector:        util.NewObjectSelector(),
+		jobStateEntries: measurementutil.NewObjectTransitionTimes(jobLifecycleLatencyMeasurementName),
+		eventQueue:      workqueue.New(),
+	}
+}
+
+type jobLifecycleLatencyMeasurement struct {
+	selector        *util.ObjectSelector
+	isRunning       bool
+	stopCh          chan struct{}
+	eventQueue      *workqueue.Type
+	jobStateEntries *measurementutil.ObjectTransitionTimes
+}
+
+// Execute supports two actions:
+// - start - Starts to observe jobs and their state transitions.
+// - gather - Gathers and prints job latency data.
+// heavily influenced by pod_startup_latency measurement
+func (p *jobLifecycleLatencyMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
+	action, err := util.GetString(config.Params, "action")
+	if err != nil {
+		return nil, err
+	}
+	switch action {
+	case "start":
+		if err := p.selector.Parse(config.Params); err != nil {
+			return nil, err
+		}
+		return nil, p.start(config.ClusterFramework.GetClientSets().GetClient())
+	case "gather":
+		timeout, err := util.GetDurationOrDefault(config.Params, "timeout", defaultWaitForFinishedJobsTimeout)
+		if err != nil {
+			return nil, err
+		}
+		return p.gather(config.ClusterFramework.GetClientSets().GetClient(), config.Identifier, timeout)
+	default:
+		return nil, fmt.Errorf("unknown action %v", action)
+	}
+
+}
+
+// Dispose cleans up after the measurement.
+func (p *jobLifecycleLatencyMeasurement) Dispose() {
+	p.stop()
+}
+
+// String returns string representation of this measurement.
+func (p *jobLifecycleLatencyMeasurement) String() string {
+	return jobLifecycleLatencyMeasurementName + ": " + p.selector.String()
+}
+
+func (p *jobLifecycleLatencyMeasurement) start(c clientset.Interface) error {
+	if p.isRunning {
+		klog.V(2).Infof("%s: job lifecycle latency measurement already running", p)
+		return nil
+	}
+	klog.V(2).Infof("%s: starting job lifecycle latency measurement...", p)
+	p.isRunning = true
+	p.stopCh = make(chan struct{})
+	i := informer.NewInformer(
+		&cache.ListWatch{
+			ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
+				p.selector.ApplySelectors(&options)
+				return c.BatchV1().Jobs(p.selector.Namespace).List(context.TODO(), options)
+			},
+			WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
+				p.selector.ApplySelectors(&options)
+				return c.BatchV1().Jobs(p.selector.Namespace).Watch(context.TODO(), options)
+			},
+		},
+		p.addEvent,
+	)
+	go p.processEvents()
+	return informer.StartAndSync(i, p.stopCh, informerSyncTimeout)
+}
+
+func (p *jobLifecycleLatencyMeasurement) addEvent(_, obj interface{}) {
+	event := &eventData{obj: obj, recvTime: time.Now()}
+	p.eventQueue.Add(event)
+}
+
+func (p *jobLifecycleLatencyMeasurement) processEvents() {
+	for p.processNextWorkItem() {
+	}
+}
+
+func (p *jobLifecycleLatencyMeasurement) processNextWorkItem() bool {
+	item, quit := p.eventQueue.Get()
+	if quit {
+		return false
+	}
+	defer p.eventQueue.Done(item)
+	event, ok := item.(*eventData)
+	if !ok {
+		klog.Warningf("Couldn't convert work item to evetData: %v", item)
+		return true
+	}
+	p.processEvent(event)
+	return true
+}
+
+// processEvent processes job state change events:
+// uses Phase Latency utility to record job state transitions
+// it currently captures the following transitions:
+// JobCreated (job.CreationTimestamp.Time) -> JobStarted (job.Status.StartTime.Time)
+// JobStarted (job.Status.StartTime.Time) -> JobCompleted (job.Status.CompletionTime.Time)
+func (p *jobLifecycleLatencyMeasurement) processEvent(event *eventData) {
+	obj := event.obj
+	if obj == nil {
+		return
+	}
+	job, ok := obj.(*batchv1.Job)
+	if !ok {
+		return
+	}
+	key := createMetaNamespaceKey(job.Namespace, job.Name)
+	if _, found := p.jobStateEntries.Get(key, jobCreated); !found {
+		p.jobStateEntries.Set(key, jobCreated, job.CreationTimestamp.Time)
+	}
+	if job.Status.StartTime != nil {
+		if _, found := p.jobStateEntries.Get(key, jobStarted); !found {
+			p.jobStateEntries.Set(key, jobStarted, job.Status.StartTime.Time)
+		}
+	}
+	if job.Status.CompletionTime != nil {
+		if _, found := p.jobStateEntries.Get(key, jobCompleted); !found {
+			p.jobStateEntries.Set(key, jobCompleted, job.Status.CompletionTime.Time)
+		}
+	}
+}
+
+func (p *jobLifecycleLatencyMeasurement) stop() {
+	if p.isRunning {
+		p.isRunning = false
+		close(p.stopCh)
+		p.eventQueue.ShutDown()
+	}
+}
+
+var jobLifecycleTransitions = map[string]measurementutil.Transition{
+	"create_to_start": {
+		From: jobCreated,
+		To:   jobStarted,
+	},
+	"start_to_complete": {
+		From: jobStarted,
+		To:   jobCompleted,
+	},
+}
+
+// gather collects job lifecycle latency and calculates percentiles using Phase Latency utility
+// it waits for all jobs to be completed before collecting the metrics or times out
+func (p *jobLifecycleLatencyMeasurement) gather(c clientset.Interface, identifier string, timeout time.Duration) ([]measurement.Summary, error) {
+	klog.V(2).Infof("%s: gathering job lifecycle latency measurement...", p)
+	if !p.isRunning {
+		return nil, fmt.Errorf("metric %s has not been started", jobLifecycleLatencyMeasurementName)
+	}
+	condition := func() (bool, error) {
+		return p.jobStateEntries.Count(jobCreated) == p.jobStateEntries.Count(jobCompleted), nil
+	}
+	if err := wait.Poll(checkCompletedJobsInterval, timeout, condition); err != nil {
+		klog.V(2).Infof("Timed out waiting for all jobs to complete: %v", err)
+	}
+	p.stop()
+	jobLifecycleLatency := p.jobStateEntries.CalculateTransitionsLatency(jobLifecycleTransitions, measurementutil.MatchAll)
+	content, jsonErr := util.PrettyPrintJSON(measurementutil.LatencyMapToPerfData(jobLifecycleLatency))
+	if jsonErr != nil {
+		return nil, jsonErr
+	}
+	summaryName := fmt.Sprintf("%s_%s", jobLifecycleLatencyMeasurementName, identifier)
+	summaries := []measurement.Summary{measurement.CreateSummary(summaryName, "json", content)}
+	return summaries, nil
+}
+
+func createMetaNamespaceKey(namespace, name string) string {
+	return namespace + "/" + name
+}
diff --git a/clusterloader2/testing/batch/config.yaml b/clusterloader2/testing/batch/config.yaml
@@ -17,6 +17,8 @@
 {{$largeJobSize := 400}}
 {{$largeJobsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 4 $largeJobSize)}}
 
+{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
+
 name: batch
 
 namespace:
@@ -35,6 +37,11 @@ steps:
     Params:
       action: start
       labelSelector: group = test-job
+  - Identifier: JobLifecycleLatency
+    Method: JobLifecycleLatency
+    Params:
+      action: start
+      labelSelector: group = test-job      
 - name: Create {{$MODE}} jobs
   phases:
   - namespaceRange:
@@ -48,6 +55,7 @@ steps:
       templateFillMap:
         Replicas: {{$smallJobSize}}
         Mode: {{$MODE}}
+        Sleep: {{$jobRunningTime}}
   - namespaceRange:
       min: 1
       max: {{$namespaces}}
@@ -59,6 +67,7 @@ steps:
       templateFillMap:
         Replicas: {{$mediumJobSize}}
         Mode: {{$MODE}}
+        Sleep: {{$jobRunningTime}}
   - namespaceRange:
       min: 1
       max: {{$namespaces}}
@@ -70,8 +79,14 @@ steps:
       templateFillMap:
         Replicas: {{$largeJobSize}}
         Mode: {{$MODE}}
+        Sleep: {{$jobRunningTime}}        
 - name: Wait for {{$MODE}} jobs to finish
   measurements:
+  - Identifier: JobLifecycleLatency
+    Method: JobLifecycleLatency
+    Params:
+      action: gather
+      timeout: 10m  
   - Identifier: WaitForFinishedJobs
     Method: WaitForFinishedJobs
     Params:

diff --git a/clusterloader2/testing/batch/job.yaml b/clusterloader2/testing/batch/job.yaml
@@ -17,5 +17,5 @@ spec:
       - name: {{.Name}}
         image: gcr.io/k8s-staging-perf-tests/sleep:v0.0.3
         args:
-          - "30s"
+          - {{.Sleep}}
       restartPolicy: Never