diff --git a/pkg/kubelet/kuberuntime/kuberuntime_gc.go b/pkg/kubelet/kuberuntime/kuberuntime_gc.go index b139dba92726..687e86d82e12 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_gc.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_gc.go @@ -356,9 +356,35 @@ func (cgc *containerGC) evictPodLogsDirectories(allSourcesReady bool) error { logSymlinks, _ := osInterface.Glob(filepath.Join(legacyContainerLogsDir, fmt.Sprintf("*.%s", legacyLogSuffix))) for _, logSymlink := range logSymlinks { if _, err := osInterface.Stat(logSymlink); os.IsNotExist(err) { + if containerID, err := getContainerIDFromLegacyLogSymlink(logSymlink); err == nil { + status, err := cgc.manager.runtimeService.ContainerStatus(containerID) + if err != nil { + // TODO: we should handle container not found (i.e. container was deleted) case differently + // once https://github.com/kubernetes/kubernetes/issues/63336 is resolved + klog.Infof("Error getting ContainerStatus for containerID %q: %v", containerID, err) + } else if status.State != runtimeapi.ContainerState_CONTAINER_EXITED { + // Here is how container log rotation works (see containerLogManager#rotateLatestLog): + // + // 1. rename current log to rotated log file whose filename contains current timestamp (fmt.Sprintf("%s.%s", log, timestamp)) + // 2. reopen the container log + // 3. if #2 fails, rename rotated log file back to container log + // + // There is small but indeterministic amount of time during which log file doesn't exist (between steps #1 and #2, between #1 and #3). + // Hence the symlink may be deemed unhealthy during that period. + // See https://github.com/kubernetes/kubernetes/issues/52172 + // + // We only remove unhealthy symlink for dead containers + klog.V(5).Infof("Container %q is still running, not removing symlink %q.", containerID, logSymlink) + continue + } + } else { + klog.V(4).Infof("unable to obtain container Id: %v", err) + } err := osInterface.Remove(logSymlink) if err != nil { klog.Errorf("Failed to remove container log dead symlink %q: %v", logSymlink, err) + } else { + klog.V(4).Infof("removed symlink %s", logSymlink) } } } diff --git a/pkg/kubelet/kuberuntime/legacy.go b/pkg/kubelet/kuberuntime/legacy.go index ae208a01a200..1187123ad7fa 100644 --- a/pkg/kubelet/kuberuntime/legacy.go +++ b/pkg/kubelet/kuberuntime/legacy.go @@ -19,6 +19,7 @@ package kuberuntime import ( "fmt" "path" + "strings" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" ) @@ -44,6 +45,25 @@ func legacyLogSymlink(containerID string, containerName, podName, podNamespace s containerName, containerID) } +// getContainerIDFromLegacyLogSymlink returns error if container Id cannot be parsed +func getContainerIDFromLegacyLogSymlink(logSymlink string) (string, error) { + parts := strings.Split(logSymlink, "-") + if len(parts) == 0 { + return "", fmt.Errorf("unable to find separator in %q", logSymlink) + } + containerIDWithSuffix := parts[len(parts)-1] + suffix := fmt.Sprintf(".%s", legacyLogSuffix) + if !strings.HasSuffix(containerIDWithSuffix, suffix) { + return "", fmt.Errorf("%q doesn't end with %q", logSymlink, suffix) + } + containerIDWithoutSuffix := strings.TrimSuffix(containerIDWithSuffix, suffix) + // container can be retrieved with container Id as short as 6 characters + if len(containerIDWithoutSuffix) < 6 { + return "", fmt.Errorf("container Id %q is too short", containerIDWithoutSuffix) + } + return containerIDWithoutSuffix, nil +} + func logSymlink(containerLogsDir, podFullName, containerName, containerID string) string { suffix := fmt.Sprintf(".%s", legacyLogSuffix) logPath := fmt.Sprintf("%s_%s-%s", podFullName, containerName, containerID) diff --git a/pkg/kubelet/remote/remote_runtime.go b/pkg/kubelet/remote/remote_runtime.go index 30ee914094e2..98867b355f90 100644 --- a/pkg/kubelet/remote/remote_runtime.go +++ b/pkg/kubelet/remote/remote_runtime.go @@ -473,7 +473,7 @@ func (r *RemoteRuntimeService) ContainerStats(containerID string) (*runtimeapi.C }) if err != nil { if r.logReduction.ShouldMessageBePrinted(err.Error(), containerID) { - klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err) + klog.Errorf("ContainerStats %q from runtime service failed: %v", containerID, err) } return nil, err }