Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip container id from events #47462

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/kubelet/dockershim/docker_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ func (ds *dockerService) removeContainerLogSymlink(containerID string) error {
func (ds *dockerService) StartContainer(containerID string) error {
err := ds.client.StartContainer(containerID)
if err != nil {
err = transformStartContainerError(err)
return fmt.Errorf("failed to start container %q: %v", containerID, err)
}
// Create container log symlink.
Expand Down
17 changes: 17 additions & 0 deletions pkg/kubelet/dockershim/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ const (
var (
conflictRE = regexp.MustCompile(`Conflict. (?:.)+ is already in use by container ([0-9a-z]+)`)

// this is hacky, but extremely common.
// if a container starts but the executable file is not found, runc gives a message that matches
startRE = regexp.MustCompile(`\\\\\\\"(.*)\\\\\\\": executable file not found`)

// Docker changes the security option separator from ':' to '=' in the 1.23
// API version.
optsSeparatorChangeVersion = semver.MustParse(securityOptSeparatorChangeVersion)
Expand Down Expand Up @@ -359,6 +363,19 @@ func recoverFromCreationConflictIfNeeded(client libdocker.Interface, createConfi
return client.CreateContainer(createConfig)
}

// transformStartContainerError does regex parsing on returned error
// for where container runtimes are giving less than ideal error messages.
func transformStartContainerError(err error) error {
if err == nil {
return nil
}
matches := startRE.FindStringSubmatch(err.Error())
if len(matches) > 0 {
return fmt.Errorf("executable not found in $PATH")
}
return err
}

// getSecurityOptSeparator returns the security option separator based on the
// docker API version.
// TODO: Remove this function along with the relevant code when we no longer
Expand Down
1 change: 1 addition & 0 deletions pkg/kubelet/kuberuntime/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ go_library(
"//vendor/github.com/fsnotify/fsnotify:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/google.golang.org/grpc:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library",
Expand Down
56 changes: 36 additions & 20 deletions pkg/kubelet/kuberuntime/kuberuntime_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,12 @@ import (
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"

"google.golang.org/grpc"

"github.com/armon/circbuf"
"github.com/golang/glog"

Expand All @@ -45,6 +48,30 @@ import (
"k8s.io/kubernetes/pkg/util/tail"
)

// recordContainerEvent should be used by the runtime manager for all container related events.
// it has sanity checks to ensure that we do not write events that can abuse our masters.
// in particular, it ensures that a containerID never appears in an event message as that
// is prone to causing a lot of distinct events that do not count well.
// it replaces any reference to a containerID with the containerName which is stable, and is what users know.
func (m *kubeGenericRuntimeManager) recordContainerEvent(pod *v1.Pod, container *v1.Container, containerID, eventType, reason, message string, args ...interface{}) {
ref, err := kubecontainer.GenerateContainerRef(pod, container)
if err != nil {
glog.Errorf("Can't make a ref to pod %q, container %v: %v", format.Pod(pod), container.Name, err)
return
}
eventMessage := message
if len(args) > 0 {
eventMessage = fmt.Sprintf(message, args...)
}
// this is a hack, but often the error from the runtime includes the containerID
// which kills our ability to deduplicate events. this protection makes a huge
// difference in the number of unique events
if containerID != "" {
eventMessage = strings.Replace(eventMessage, containerID, container.Name, -1)
}
m.recorder.Event(events.ToObjectReference(ref), eventType, reason, eventMessage)
}

// startContainer starts a container and returns a message indicates why it is failed on error.
// It starts the container through the following steps:
// * pull the image
Expand Down Expand Up @@ -74,15 +101,16 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb

containerConfig, err := m.generateContainerConfig(container, pod, restartCount, podIP, imageRef)
if err != nil {
m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToCreateContainer, "Failed to create container with error: %v", err)
m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
return "Generate Container Config Failed", err
}
containerID, err := m.runtimeService.CreateContainer(podSandboxID, containerConfig, podSandboxConfig)
if err != nil {
m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToCreateContainer, "Failed to create container with error: %v", err)
m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
return "Create Container Failed", err
}
m.recorder.Eventf(ref, v1.EventTypeNormal, events.CreatedContainer, "Created container with id %v", containerID)
m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, "Created container")

if ref != nil {
m.containerRefManager.SetRef(kubecontainer.ContainerID{
Type: m.runtimeName,
Expand All @@ -93,11 +121,10 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
// Step 3: start the container.
err = m.runtimeService.StartContainer(containerID)
if err != nil {
m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToStartContainer,
"Failed to start container with id %v with error: %v", containerID, err)
m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", grpc.ErrorDesc(err))
return "Start Container Failed", err
}
m.recorder.Eventf(ref, v1.EventTypeNormal, events.StartedContainer, "Started container with id %v", containerID)
m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, "Started container")

// Symlink container logs to the legacy container log location for cluster logging
// support.
Expand All @@ -120,8 +147,7 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
}
msg, handlerErr := m.runner.Run(kubeContainerID, pod, container, container.Lifecycle.PostStart)
if handlerErr != nil {
err := fmt.Errorf("PostStart handler: %v", handlerErr)
m.generateContainerEvent(kubeContainerID, v1.EventTypeWarning, events.FailedPostStartHook, msg)
m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, msg)
m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", nil)
return "PostStart Hook Failed", err
}
Expand Down Expand Up @@ -439,16 +465,6 @@ func toKubeContainerStatus(status *runtimeapi.ContainerStatus, runtimeName strin
return cStatus
}

// generateContainerEvent generates an event for the container.
func (m *kubeGenericRuntimeManager) generateContainerEvent(containerID kubecontainer.ContainerID, eventType, reason, message string) {
ref, ok := m.containerRefManager.GetRef(containerID)
if !ok {
glog.Warningf("No ref for container %q", containerID)
return
}
m.recorder.Event(events.ToObjectReference(ref), eventType, reason, message)
}

// executePreStopHook runs the pre-stop lifecycle hooks if applicable and returns the duration it takes.
func (m *kubeGenericRuntimeManager) executePreStopHook(pod *v1.Pod, containerID kubecontainer.ContainerID, containerSpec *v1.Container, gracePeriod int64) int64 {
glog.V(3).Infof("Running preStop hook for container %q", containerID.String())
Expand All @@ -460,7 +476,7 @@ func (m *kubeGenericRuntimeManager) executePreStopHook(pod *v1.Pod, containerID
defer utilruntime.HandleCrash()
if msg, err := m.runner.Run(containerID, pod, containerSpec, containerSpec.Lifecycle.PreStop); err != nil {
glog.Errorf("preStop hook for container %q failed: %v", containerSpec.Name, err)
m.generateContainerEvent(containerID, v1.EventTypeWarning, events.FailedPreStopHook, msg)
m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeWarning, events.FailedPreStopHook, msg)
}
}()

Expand Down Expand Up @@ -568,7 +584,7 @@ func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubec
if reason != "" {
message = fmt.Sprint(message, ":", reason)
}
m.generateContainerEvent(containerID, v1.EventTypeNormal, events.KillingContainer, message)
m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeNormal, events.KillingContainer, message)
m.containerRefManager.ClearRef(containerID)

return err
Expand Down
8 changes: 7 additions & 1 deletion pkg/kubelet/pod_workers.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,13 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) {
}
if err != nil {
glog.Errorf("Error syncing pod %s (%q), skipping: %v", update.Pod.UID, format.Pod(update.Pod), err)
p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "Error syncing pod, skipping: %v", err)
// if we failed sync, we throw more specific events for why it happened.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't, or at least not in syncPod() in kubelet.go. There are errors if the pod is not runnable or network is not ready, and I only checked a few. This lets you aggregates event, yes, but may actually lose the information that user wants to know.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yujuhong - i am open to adding an error when the network is not ready or any other more specific error situation. what i dislike is that we used FailedSync as a catch-all and as a result we duplicated the same information twice. As a user, what does FailedSync even mean? i would argue it only has meaning for kubelet authors, and not kubernetes users. if i got an event "NetworkNotReady" with a message that is way better than "FailedSync".

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't saying the FailedSync as a catch-all message is good. Just pointing out that your PR removed some information that could be meaningful to the users, an no alternative event was added.

// as a result, i question the value of this event.
// TODO: determine if we can remove this in a future release.
// do not include descriptive text that can vary on why it failed so in a pathological
// scenario, kubelet does not create enough discrete events that miss default aggregation
// window.
p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "Error syncing pod")
}
p.wrapUp(update.Pod.UID, err)
}
Expand Down