Add Support for Evented PLEG #111384
pkg/kubelet/container/cache.go:

@@ -21,6 +21,8 @@ import (
 	"time"

 	"k8s.io/apimachinery/pkg/types"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
+	"k8s.io/kubernetes/pkg/features"
 )

 // Cache stores the PodStatus for the pods. It represents *all* the visible
@@ -36,7 +38,10 @@ import (
 // cache entries.
 type Cache interface {
 	Get(types.UID) (*PodStatus, error)
-	Set(types.UID, *PodStatus, error, time.Time)
+	// Set updates the cache by setting the PodStatus for the pod only
+	// if the data is newer than the cache based on the provided
+	// time stamp. Returns whether the cache was updated.
+	Set(types.UID, *PodStatus, error, time.Time) (updated bool)
 	// GetNewerThan is a blocking call that only returns the status
 	// when it is newer than the given time.
 	GetNewerThan(types.UID, time.Time) (*PodStatus, error)
@@ -93,12 +98,22 @@ func (c *cache) GetNewerThan(id types.UID, minTime time.Time) (*PodStatus, error
 	return d.status, d.err
 }

-// Set sets the PodStatus for the pod.
-func (c *cache) Set(id types.UID, status *PodStatus, err error, timestamp time.Time) {
+// Set sets the PodStatus for the pod only if the data is newer than the cache.
+func (c *cache) Set(id types.UID, status *PodStatus, err error, timestamp time.Time) (updated bool) {
 	c.lock.Lock()
 	defer c.lock.Unlock()
-	defer c.notify(id, timestamp)

+	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
+		// Set the value in the cache only if it's not present already
+		// or the timestamp in the cache is older than the current update timestamp.
+		if cachedVal, ok := c.pods[id]; ok && cachedVal.modified.After(timestamp) {
+			return false
+		}
+	}

 	c.pods[id] = &data{status: status, err: err, modified: timestamp}
+	c.notify(id, timestamp)
+	return true
 }

 // Delete removes the entry of the pod.
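The ordering guard in Set is what lets two status producers (periodic relists and CRI container events) write to the same cache without an older relist result clobbering a newer event. A minimal, self-contained sketch of the same idea, with simplified stand-in types (none of these names are the kubelet's):

package main

import (
	"fmt"
	"sync"
	"time"
)

// entry and timestampedCache are simplified stand-ins for the kubelet's
// pod status cache; names here are illustrative, not the kubelet's.
type entry struct {
	status   string
	modified time.Time
}

type timestampedCache struct {
	mu   sync.Mutex
	pods map[string]entry
}

// set stores status only if it does not carry an older timestamp than
// the cached entry, mirroring the guarded Set above.
func (c *timestampedCache) set(id, status string, ts time.Time) bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	if cur, ok := c.pods[id]; ok && cur.modified.After(ts) {
		return false // stale write: a newer status is already cached
	}
	c.pods[id] = entry{status: status, modified: ts}
	return true
}

func main() {
	c := &timestampedCache{pods: map[string]entry{}}
	now := time.Now()
	fmt.Println(c.set("pod-a", "Running", now))                   // true: accepted
	fmt.Println(c.set("pod-a", "Pending", now.Add(-time.Second))) // false: rejected as stale
}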
@@ -142,6 +157,29 @@ func (c *cache) get(id types.UID) *data {
 // Otherwise, it returns nil. The caller should acquire the lock.
 func (c *cache) getIfNewerThan(id types.UID, minTime time.Time) *data {
 	d, ok := c.pods[id]
+	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
+		// Evented PLEG has CREATED, STARTED, STOPPED and DELETED events.
+		// However, if container creation fails for some reason there is no
+		// CRI event received by the kubelet, and that pod will get stuck in a
+		// GetNewerThan call in the pod workers. This is reproducible with
+		// the node e2e test,
+		// https://github.com/kubernetes/kubernetes/blob/83415e5c9e6e59a3d60a148160490560af2178a1/test/e2e_node/pod_hostnamefqdn_test.go#L161
+		// which forces failure during pod creation. This issue also exists in
+		// the Generic PLEG, but since it updates the global timestamp
+		// periodically, the GetNewerThan call gets unstuck.
+
+		// During node e2e tests, it was observed this change does not have any
+		// adverse impact on the behaviour of the Generic PLEG as well.
+		switch {
+		case !ok:
+			return makeDefaultData(id)
+		case ok && (d.modified.After(minTime) || (c.timestamp != nil && c.timestamp.After(minTime))):
+			return d
+		default:
+			return nil
+		}
+	}

 	globalTimestampIsNewer := (c.timestamp != nil && c.timestamp.After(minTime))
 	if !ok && globalTimestampIsNewer {
 		// Status is not cached, but the global timestamp is newer than

Reviewer: So the difference is: when EventedPLEG is enabled and the pod ID is not found in the cache, default data is always returned; when EventedPLEG is disabled and the pod ID is not found in the cache, default data is returned only if the global timestamp is newer. Why the difference? I see that evented.go updates the global timestamp every 5s. Is that too infrequent, so that pod workers might get stuck for 5 seconds?

Author: Actually, the change works fine even when Evented PLEG is not enabled; I didn't observe any different behaviour in the tests. However, I thought it would be safer to keep the existing behaviour as is and change the behaviour only when Evented PLEG is enabled, out of an abundance of caution. We could let this change go through some rigorous testing before making it the default without gating it on Evented PLEG.

Reviewer: Yes, I understand that we want to keep the existing behavior as is. But I wanted to understand why Evented PLEG always returns default data when the pod ID is not found, as opposed to the behavior of the existing PLEG, which only returns default data when the global timestamp is newer.

Author: While using Evented PLEG there can be a case where the CRI fails to create a container and the pod ID is unknown in the cache, which is similar to the node e2e test https://bit.ly/3yJur4P; essentially, we noticed that without this change the pod workers get stuck and the test fails. The default data returned from this function […]. Noticeably, changing this behavior also doesn't have any adverse impact while just using the Generic PLEG; however, as @harche suggested, we could let this change go through some rigorous testing before taking it out of the feature gate.

Reviewer: Just to clarify: do you mean that if we change this to the following […] (in other words, the same as the Generic PLEG, https://github.com/kubernetes/kubernetes/pull/111384/files#diff-b010d1c9ff51c8975875be97aeebd3c8c6206128f520115752f9dc007e8afcd1R187), then the Generic PLEG mitigates this problem by periodically updating the global timestamp? That part I understand. But doesn't Evented PLEG update the global timestamp every 5 seconds too (https://github.com/kubernetes/kubernetes/pull/111384/files#diff-465aae123f6877816ff0428968fc74f503fe15b27b434480bf57b79e62c7fa2fR114)? So […]. This change looks OK to me, because in theory the cache entry is always up to date with Evented PLEG; I'm just trying to understand why the issue happens.

Author: Yes, that correctly describes the scenario.

Reviewer: I had the same question when I was reviewing this; I was also trying to understand the reason for the difference here, since it is not immediately clear :). Can we maybe summarize the reasoning for this change as a comment in the code?

Author: We tried to capture this in the comment above the code, but feel free to suggest any amendments.

Reviewer: Thanks for the comment; I had the same question as @bobbypage when catching up on the latest.
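To make the difference discussed above concrete, here is a small stand-alone model of the two lookup policies (illustrative names, not the kubelet's types). Returning default data for an unknown pod ID gives a blocked GetNewerThan caller something to consume immediately; returning nil leaves it waiting until a relist bumps the global timestamp.

package main

import (
	"fmt"
	"time"
)

// lookup models cache.getIfNewerThan for a pod ID that may be missing.
// A nil result means the GetNewerThan caller keeps blocking.
func lookup(cached map[string]time.Time, id string, minTime time.Time, defaultForUnknown bool) *time.Time {
	mod, ok := cached[id]
	switch {
	case !ok && defaultForUnknown:
		t := time.Time{} // "default data": unblocks the waiter right away
		return &t
	case ok && mod.After(minTime):
		return &mod
	default:
		return nil // keep waiting (until a relist updates the global timestamp)
	}
}

func main() {
	cached := map[string]time.Time{} // the failed pod never made it into the cache
	minTime := time.Now()
	fmt.Println(lookup(cached, "pod-x", minTime, true) != nil)  // true: evented policy unblocks
	fmt.Println(lookup(cached, "pod-x", minTime, false) != nil) // false: generic policy waits
}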
pkg/kubelet/cri/remote/remote_runtime.go:

@@ -20,6 +20,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"io"
 	"strings"
 	"time"

@@ -792,5 +793,25 @@ func (r *remoteRuntimeService) CheckpointContainer(ctx context.Context, options
 }
 func (r *remoteRuntimeService) GetContainerEvents(containerEventsCh chan *runtimeapi.ContainerEventResponse) error {
-	return nil
+	containerEventsStreamingClient, err := r.runtimeClient.GetContainerEvents(context.Background(), &runtimeapi.GetEventsRequest{})
+	if err != nil {
+		klog.ErrorS(err, "GetContainerEvents failed to get streaming client")
+		return err
+	}
+
+	for {
+		resp, err := containerEventsStreamingClient.Recv()
+		if err == io.EOF {
+			klog.ErrorS(err, "container events stream is closed")
+			return err
+		}
+		if err != nil {
+			klog.ErrorS(err, "failed to receive streaming container event")
+			return err
+		}
+		if resp != nil {
+			containerEventsCh <- resp
+			klog.V(4).InfoS("container event received", "resp", resp)
+		}
+	}
 }

Reviewer: What happens when this times out? Isn't this subject to the max time of an open gRPC connection?

Author: Added code to retry on error and force a relisting to update the cache and the (pod and container running) metric: https://github.com/kubernetes/kubernetes/pull/111384/files#diff-465aae123f6877816ff0428968fc74f503fe15b27b434480bf57b79e62c7fa2fR118

Reviewer: This is an area where being able to trigger this in a test (ideally above unit level) will expose low-frequency errors sooner. Does the event stream include heartbeats? There are lots of failure modes for event streams that involve not explicitly getting events even though the connection is still open. If we detect an anomaly during relist (a missed event), we may want to force this connection closed and reopen it.

Reviewer: Maybe I should have asked this before, but how are you synchronizing this with the "List" action so that you don't lose events between the time of List and when you invoke GetContainerEvents? You need an equivalent to the […]

Author: This is an interesting idea.

Author: We are getting the gRPC streaming client at kubelet startup and then keep receiving the events using Recv().

Author: We are simply using the gRPC streaming client and depend on its mechanism to maintain and verify a healthy stream. We do get an error back from that client should it detect anything wrong.

Reviewer: Make this error more descriptive.

Author: Added.

Reviewer: The resp != nil check can be removed, since resp can never be nil.

Author: Yeah, the […]
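The exchange above centers on what happens when the stream dies or silently stalls. The sketch below illustrates the retry-plus-relist shape the authors describe; it is written under stated assumptions (eventStreamer, relister, and watchEvents are invented stand-ins, not the PR's evented.go code; only the retry bound of 5 mirrors the eventedPlegMaxStreamRetries constant introduced later in this diff):

package main

import (
	"errors"
	"fmt"
	"time"
)

type event struct{ podID string }

// eventStreamer and relister are illustrative stand-ins for the remote
// runtime service and the Generic PLEG.
type eventStreamer interface {
	GetContainerEvents(ch chan *event) error // blocks until the stream breaks
}

type relister interface {
	Relist() // full resync; repairs anything the stream missed
}

const maxStreamRetries = 5

// watchEvents retries the stream a bounded number of times, forcing a
// relist after each failure so the cache catches up on missed events.
func watchEvents(rt eventStreamer, gp relister, ch chan *event) error {
	for attempt := 0; attempt < maxStreamRetries; attempt++ {
		err := rt.GetContainerEvents(ch)
		fmt.Println("event stream broke, relisting before retry:", err)
		gp.Relist()
		time.Sleep(10 * time.Millisecond) // brief backoff before reconnecting
	}
	return errors.New("container event stream failed repeatedly; fall back to Generic PLEG relisting")
}

type fakeRuntime struct{}

func (fakeRuntime) GetContainerEvents(ch chan *event) error {
	return errors.New("stream closed") // simulate a broken connection
}

type fakeLister struct{}

func (fakeLister) Relist() { fmt.Println("relist: full resync") }

func main() {
	ch := make(chan *event, 16)
	fmt.Println(watchEvents(fakeRuntime{}, fakeLister{}, ch))
}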
pkg/kubelet/kubelet.go:

@@ -162,7 +162,13 @@ const (
 	// Note that even though we set the period to 1s, the relisting itself can
 	// take more than 1s to finish if the container runtime responds slowly
 	// and/or when there are many container changes in one cycle.
-	plegRelistPeriod = time.Second * 1
+	genericPlegRelistPeriod    = time.Second * 1
+	genericPlegRelistThreshold = time.Minute * 3
+
+	// Generic PLEG relist period and threshold when used with Evented PLEG.
+	eventedPlegRelistPeriod     = time.Second * 300
+	eventedPlegRelistThreshold  = time.Minute * 10
+	eventedPlegMaxStreamRetries = 5

 	// backOffPeriod is the period to back off when pod syncing results in an
 	// error. It is also used as the base period for the exponential backoff

Reviewer: Just noting to myself as I read the implementation, but it's not clear what value this field takes when the existing Generic PLEG is used.

Author: If the feature gate […]

Reviewer: This 3m value is the same value that used to live in the Generic PLEG; it has moved here unchanged and is passed down, so it's not a behavior change.

Reviewer: Nit: naming on this would be clearer if it were the following: […]

Author: Thanks, will rename this variable.
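The threshold has to grow with the period because it feeds the PLEG health check: a Generic PLEG relisting every 300s under the old 3m threshold would be declared unhealthy by construction. A sketch of that interaction (illustrative; the real check is the PLEG's Healthy method, registered in the kubelet wiring below):

package main

import (
	"fmt"
	"time"
)

// healthy reports whether the PLEG relisted within the threshold.
// Illustrative stand-in for the PLEG's Healthy method.
func healthy(lastRelist time.Time, threshold time.Duration, now time.Time) (bool, error) {
	if elapsed := now.Sub(lastRelist); elapsed > threshold {
		return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, threshold)
	}
	return true, nil
}

func main() {
	now := time.Now()
	// Generic PLEG alone: 1s period under a 3m threshold.
	fmt.Println(healthy(now.Add(-2*time.Minute), 3*time.Minute, now)) // true
	// With Evented PLEG the backstop relist runs only every 300s, so the
	// threshold must widen to 10m or the relist would look unhealthy.
	fmt.Println(healthy(now.Add(-5*time.Minute), 3*time.Minute, now))  // false under the old threshold
	fmt.Println(healthy(now.Add(-5*time.Minute), 10*time.Minute, now)) // true under the new one
}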
@@ -699,9 +705,37 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
 			utilfeature.DefaultFeatureGate.Enabled(features.PodAndContainerStatsFromCRI))
 	}

-	klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, plegChannelCapacity, plegRelistPeriod, klet.podCache, clock.RealClock{})
+	eventChannel := make(chan *pleg.PodLifecycleEvent, plegChannelCapacity)

+	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
+		// Adjust the Generic PLEG relisting period and threshold to higher values when Evented PLEG is turned on.
+		genericRelistDuration := &pleg.RelistDuration{
+			RelistPeriod:    eventedPlegRelistPeriod,
+			RelistThreshold: eventedPlegRelistThreshold,
+		}
+		klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
+		// In case Evented PLEG has to fall back on Generic PLEG due to an error,
+		// Evented PLEG should be able to reset the Generic PLEG relisting duration
+		// to the default value.
+		eventedRelistDuration := &pleg.RelistDuration{
+			RelistPeriod:    genericPlegRelistPeriod,
+			RelistThreshold: genericPlegRelistThreshold,
+		}
+		klet.eventedPleg = pleg.NewEventedPLEG(klet.containerRuntime, klet.runtimeService, eventChannel,
+			klet.podCache, klet.pleg, eventedPlegMaxStreamRetries, eventedRelistDuration, clock.RealClock{})
+	} else {
+		genericRelistDuration := &pleg.RelistDuration{
+			RelistPeriod:    genericPlegRelistPeriod,
+			RelistThreshold: genericPlegRelistThreshold,
+		}
+		klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, eventChannel, genericRelistDuration, klet.podCache, clock.RealClock{})
+	}

 	klet.runtimeState = newRuntimeState(maxWaitForContainerRuntime)
 	klet.runtimeState.addHealthCheck("PLEG", klet.pleg.Healthy)
+	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
+		klet.runtimeState.addHealthCheck("EventedPLEG", klet.eventedPleg.Healthy)
+	}
 	if _, err := klet.updatePodCIDR(ctx, kubeCfg.PodCIDR); err != nil {
 		klog.ErrorS(err, "Pod CIDR update failed")
 	}

Reviewer: Before beta, we should clarify how we respond to these error conditions; notably, when is the Evented PLEG re-established versus just falling back to generic behavior?

Author: I agree. Thanks.
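Note the inversion in the wiring above: the Generic PLEG is constructed with the slow cadence, while the Evented PLEG is handed the fast generic defaults, because those defaults are what it must restore if it ever gives up on the event stream. A sketch of that fallback under stated assumptions (RelistDuration mirrors the struct used above; Stop, Update, Start, and fallBackToGenericPLEG are illustrative names, not necessarily the PR's):

package main

import (
	"fmt"
	"time"
)

// RelistDuration mirrors the struct introduced in this PR.
type RelistDuration struct {
	RelistPeriod    time.Duration
	RelistThreshold time.Duration
}

// genericPLEG is a stand-in with just enough surface for the sketch.
type genericPLEG struct{ d RelistDuration }

func (g *genericPLEG) Stop() { fmt.Println("generic PLEG stopped") }

func (g *genericPLEG) Update(d RelistDuration) { g.d = d }

func (g *genericPLEG) Start() {
	fmt.Printf("generic PLEG started: period=%v threshold=%v\n", g.d.RelistPeriod, g.d.RelistThreshold)
}

type eventedPLEG struct {
	generic        *genericPLEG
	relistDuration RelistDuration // generic defaults, restored on fallback
}

// fallBackToGenericPLEG models the recovery path discussed in the review:
// stop the slow backstop relisting and restore the generic defaults.
func (e *eventedPLEG) fallBackToGenericPLEG() {
	e.generic.Stop()
	e.generic.Update(e.relistDuration)
	e.generic.Start()
}

func main() {
	g := &genericPLEG{d: RelistDuration{RelistPeriod: 300 * time.Second, RelistThreshold: 10 * time.Minute}}
	e := &eventedPLEG{generic: g, relistDuration: RelistDuration{RelistPeriod: time.Second, RelistThreshold: 3 * time.Minute}}
	e.fallBackToGenericPLEG()
}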
@@ -1062,6 +1096,9 @@ type Kubelet struct {
 	// Generates pod events.
 	pleg pleg.PodLifecycleEventGenerator

+	// Evented PLEG
+	eventedPleg pleg.PodLifecycleEventGenerator

 	// Store kubecontainer.PodStatus for all pods.
 	podCache kubecontainer.Cache
@@ -1485,6 +1522,12 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) {

 	// Start the pod lifecycle event generator.
 	kl.pleg.Start()

+	// Start eventedPLEG only if EventedPLEG feature gate is enabled.
+	if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
+		kl.eventedPleg.Start()
+	}

 	kl.syncLoop(ctx, updates, kl)
 }
pkg/kubelet/kuberuntime/kuberuntime_container.go:

@@ -490,6 +490,29 @@ func (m *kubeGenericRuntimeManager) readLastStringFromContainerLogs(path string)
 	return buf.String()
 }

+func (m *kubeGenericRuntimeManager) convertToKubeContainerStatus(status *runtimeapi.ContainerStatus) (cStatus *kubecontainer.Status) {
+	cStatus = toKubeContainerStatus(status, m.runtimeName)
+	if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
+		// Populate the termination message if needed.
+		annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
+		// If a container cannot even be started, it certainly does not have logs, so no need to fallbackToLogs.
+		fallbackToLogs := annotatedInfo.TerminationMessagePolicy == v1.TerminationMessageFallbackToLogsOnError &&
+			cStatus.ExitCode != 0 && cStatus.Reason != "ContainerCannotRun"
+		tMessage, checkLogs := getTerminationMessage(status, annotatedInfo.TerminationMessagePath, fallbackToLogs)
+		if checkLogs {
+			tMessage = m.readLastStringFromContainerLogs(status.GetLogPath())
+		}
+		// Enrich the termination message if the message written by the application is not empty.
+		if len(tMessage) != 0 {
+			if len(cStatus.Message) != 0 {
+				cStatus.Message += ": "
+			}
+			cStatus.Message += tMessage
+		}
+	}
+	return cStatus
+}

Reviewer: Just noting to myself, this is the same code as elsewhere, just extracted for reuse.

Author: Yes, that's correct.
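The interesting part of the extracted helper is the guard that decides when to fall back to reading the termination message from the container logs. Reduced to a predicate over plain values (string stand-ins replace the real v1 policy and reason types, purely for illustration):

package main

import "fmt"

// shouldFallBackToLogs mirrors the condition in convertToKubeContainerStatus:
// only read logs for the termination message when the policy asks for it, the
// container actually failed, and the failure wasn't "could not even start".
func shouldFallBackToLogs(policy string, exitCode int, reason string) bool {
	return policy == "FallbackToLogsOnError" && exitCode != 0 && reason != "ContainerCannotRun"
}

func main() {
	fmt.Println(shouldFallBackToLogs("FallbackToLogsOnError", 1, "Error"))              // true
	fmt.Println(shouldFallBackToLogs("FallbackToLogsOnError", 0, "Completed"))          // false: succeeded
	fmt.Println(shouldFallBackToLogs("FallbackToLogsOnError", 1, "ContainerCannotRun")) // false: never started, no logs
	fmt.Println(shouldFallBackToLogs("File", 1, "Error"))                               // false: policy is file-only
}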
 // getPodContainerStatuses gets all containers' statuses for the pod.
 func (m *kubeGenericRuntimeManager) getPodContainerStatuses(ctx context.Context, uid kubetypes.UID, name, namespace string) ([]*kubecontainer.Status, error) {
 	// Select all containers of the given pod.

@@ -521,25 +544,7 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(ctx context.Context,
 		if status == nil {
 			return nil, remote.ErrContainerStatusNil
 		}
-		cStatus := toKubeContainerStatus(status, m.runtimeName)
-		if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
-			// Populate the termination message if needed.
-			annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
-			// If a container cannot even be started, it certainly does not have logs, so no need to fallbackToLogs.
-			fallbackToLogs := annotatedInfo.TerminationMessagePolicy == v1.TerminationMessageFallbackToLogsOnError &&
-				cStatus.ExitCode != 0 && cStatus.Reason != "ContainerCannotRun"
-			tMessage, checkLogs := getTerminationMessage(status, annotatedInfo.TerminationMessagePath, fallbackToLogs)
-			if checkLogs {
-				tMessage = m.readLastStringFromContainerLogs(status.GetLogPath())
-			}
-			// Enrich the termination message written by the application is not empty
-			if len(tMessage) != 0 {
-				if len(cStatus.Message) != 0 {
-					cStatus.Message += ": "
-				}
-				cStatus.Message += tMessage
-			}
-		}
+		cStatus := m.convertToKubeContainerStatus(status)
 		statuses = append(statuses, cStatus)
 	}
Reviewer: Nit: we can return early if no update is needed, to save a couple of lines.

Author: This looks better, thanks.

Reviewer: Are we going to take this suggestion? I am still seeing the old code here.

Author: Oh sorry, my bad, I missed that somehow. I will update the code.