Put restart count into docker label #15965

Merged: 1 commit, Oct 24, 2015
11 changes: 9 additions & 2 deletions pkg/kubelet/dockertools/fake_docker_client.go
@@ -147,6 +147,9 @@ func (f *FakeDockerClient) ListContainers(options docker.ListContainersOptions)
f.called = append(f.called, "list")
err := f.popError("list")
if options.All {
// Although the list is not fully sorted, containers with the same name are in order,
// which is enough for us now.
// TODO(random-liu): Is a fully sorted array needed?
return append(f.ContainerList, f.ExitedContainerList...), err
}
return append([]docker.APIContainers{}, f.ContainerList...), err
@@ -204,7 +207,10 @@ func (f *FakeDockerClient) CreateContainer(c docker.CreateContainerOptions) (*do
// This is not a very good fake. We'll just add this container's name to the list.
// Docker likes to add a '/', so copy that behavior.
name := "/" + c.Name
f.ContainerList = append(f.ContainerList, docker.APIContainers{ID: name, Names: []string{name}, Image: c.Config.Image})
// The newest container should be in front, because we assume so in GetPodStatus()
f.ContainerList = append([]docker.APIContainers{
{ID: name, Names: []string{name}, Image: c.Config.Image, Labels: c.Config.Labels},
}, f.ContainerList...)
container := docker.Container{ID: name, Name: name, Config: c.Config}
if f.ContainerMap != nil {
containerCopy := container
@@ -266,7 +272,8 @@ func (f *FakeDockerClient) StopContainer(id string, timeout uint) error {
var newList []docker.APIContainers
for _, container := range f.ContainerList {
if container.ID == id {
f.ExitedContainerList = append(f.ExitedContainerList, container)
// The newest exited container should be in front, because we assume so in GetPodStatus().
f.ExitedContainerList = append([]docker.APIContainers{container}, f.ExitedContainerList...)
continue
}
newList = append(newList, container)
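For illustration only (not part of this diff), here is a minimal sketch of the newest-first ordering convention the fake client now maintains, assuming the go-dockerclient APIContainers type used above; the helper name and import path are illustrative assumptions rather than code from this PR.

package dockertools

import docker "github.com/fsouza/go-dockerclient"

// prependContainer is a hypothetical helper showing the pattern used in the fake
// client: new entries go to the front of the slice, so callers such as
// GetPodStatus, which assume a list sorted newest-first, see the most recent
// container first.
func prependContainer(list []docker.APIContainers, c docker.APIContainers) []docker.APIContainers {
	return append([]docker.APIContainers{c}, list...)
}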
131 changes: 72 additions & 59 deletions pkg/kubelet/dockertools/manager.go
@@ -73,6 +73,7 @@ const (
kubernetesPodLabel = "io.kubernetes.pod.data"
kubernetesTerminationGracePeriodLabel = "io.kubernetes.pod.terminationGracePeriod"
kubernetesContainerLabel = "io.kubernetes.container.name"
kubernetesContainerRestartCountLabel = "io.kubernetes.container.restartCount"

DockerNetnsFmt = "/proc/%v/ns/net"
)
@@ -368,11 +369,27 @@ func (dm *DockerManager) inspectContainer(dockerID, containerName, tPath string,
}

glog.V(4).Infof("Container inspect result: %+v", *inspectResult)

// Get restartCount from the docker label, and add it to the result.
// If there is no restart count label on a container, either:
// 1. It is an infra container, which never uses the restart count, or
// 2. It is an old or invalid container; we just set its restart count to 0 for now.
var restartCount int
if restartCountString, found := inspectResult.Config.Labels[kubernetesContainerRestartCountLabel]; found {
restartCount, err = strconv.Atoi(restartCountString)
if err != nil {
glog.Errorf("Error parsing restart count string %s for container %s: %v", restartCountString, dockerID, err)
// Just set restartCount to 0 to handle this abnormal case
restartCount = 0
}
}

result.status = api.ContainerStatus{
Name: containerName,
Image: inspectResult.Config.Image,
ImageID: DockerPrefix + inspectResult.Image,
ContainerID: DockerPrefix + dockerID,
Name: containerName,
RestartCount: restartCount,
Image: inspectResult.Config.Image,
ImageID: DockerPrefix + inspectResult.Image,
ContainerID: DockerPrefix + dockerID,
}

if inspectResult.State.Running {
@@ -431,23 +448,21 @@ func (dm *DockerManager) inspectContainer(dockerID, containerName, tPath string,
// GetPodStatus returns docker related status for all containers in the pod as
// well as the infrastructure container.
func (dm *DockerManager) GetPodStatus(pod *api.Pod) (*api.PodStatus, error) {
// We now retain the restart count of a container as a docker label. Each time a container
// restarts, the kubelet reads the restart count from the latest dead container, increments
// it to get the new restart count, and then adds a label with the new restart count to
// the newly started container.
// However, this method has some limitations:
// 1. When all dead containers have been garbage collected, the container status cannot
// recover the historical value and will be *inaccurate*. Fortunately, the chance of this
// is really slim.
// 2. When working with old containers which have no restart count label, we can only
// assume their restart count is 0.
// In any case, we only promise "best-effort" restart count reporting, so we can ignore
// these limitations for now.
podFullName := kubecontainer.GetPodFullName(pod)
uid := pod.UID
Contributor: Suggestion: add a comment here stating the limitation of this function. E.g., if all dead containers were garbage collected, the container status may be inaccurate (including state and restart count).

manifest := pod.Spec

oldStatuses := make(map[string]api.ContainerStatus, len(pod.Spec.Containers))
lastObservedTime := make(map[string]unversioned.Time, len(pod.Spec.Containers))
// Record the last time we observed a container termination.
for _, status := range pod.Status.ContainerStatuses {
oldStatuses[status.Name] = status
if status.LastTerminationState.Terminated != nil {
timestamp, ok := lastObservedTime[status.Name]
if !ok || timestamp.Before(status.LastTerminationState.Terminated.FinishedAt) {
lastObservedTime[status.Name] = status.LastTerminationState.Terminated.FinishedAt
}
}
}

var podStatus api.PodStatus
statuses := make(map[string]*api.ContainerStatus, len(pod.Spec.Containers))

@@ -467,6 +482,7 @@ func (dm *DockerManager) GetPodStatus(pod *api.Pod) (*api.PodStatus, error) {
// the statuses. We assume docker returns a list of containers sorted in
// reverse by time.
for _, value := range containers {
// TODO(random-liu): Filter by docker label later.
if len(value.Names) == 0 {
continue
}
@@ -490,65 +506,41 @@ func (dm *DockerManager) GetPodStatus(pod *api.Pod) (*api.PodStatus, error) {
continue
}

var terminationState *api.ContainerState = nil
// Inspect the container.
result := dm.inspectContainer(value.ID, dockerContainerName, terminationMessagePath, pod)
if result.err != nil {
return nil, result.err
} else if result.status.State.Terminated != nil {
terminationState = &result.status.State
}

if containerStatus, found := statuses[dockerContainerName]; found {
if containerStatus.LastTerminationState.Terminated == nil && terminationState != nil {
// Populate the last termination state.
containerStatus.LastTerminationState = *terminationState
}
if terminationState == nil {
// Not a dead container.
// There should be no alive containers with the same name. Just in case.
if result.status.State.Terminated == nil {
continue
}
// Only count dead containers terminated after last time we observed,
lastObservedTime, ok := lastObservedTime[dockerContainerName]
if !ok || terminationState.Terminated.FinishedAt.After(lastObservedTime.Time) {
containerStatus.RestartCount += 1
} else {
// The container finished before the last observation. No
// need to examine/count the older containers. Mark the
// container name as done.
containerDone.Insert(dockerContainerName)
}
containerStatus.LastTerminationState = result.status.State
// Got the last termination state; we do not need to care about the other containers any more.
containerDone.Insert(dockerContainerName)
continue
}

if dockerContainerName == PodInfraContainerName {
// Found network container
if result.status.State.Running != nil {
podStatus.PodIP = result.ip
}
} else {
// Add user container information.
if oldStatus, found := oldStatuses[dockerContainerName]; found {
// Use the last observed restart count if it's available.
result.status.RestartCount = oldStatus.RestartCount
}
statuses[dockerContainerName] = &result.status
}
}

// Handle the containers for which we cannot find any associated active or dead docker containers, or which are in restart backoff.
// Fetch old container statuses from the old pod status.
Contributor: I think we may not need this now that we write the restart count as a label. kubelet would only lose the count if the most recently died container had been GC'd before we create a new container. I think the chance is slim (though higher now due to container restart backoff).

You may also want to remove the hacky fix in https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/kubelet.go#L2633

Contributor: Remove lines 527 to 530 since you've removed the hacky fix. The function here will not get the latest status anyway.

Member (author): Let me see... If we really lost all the containers, would it be better to retain the restart count from the old status, even though it may not be up to date? Or do you think this effort is not worth it since the chance is really slim, and we can just ignore it?

Contributor: The hacky fix actually passes the latest status known to kubelet to this function. Now that you've removed that, this only prevents one situation -- where the containers were removed during kubelet restarts. This should not happen often, but I suppose we can keep this :)

Member (author): OK, I'll keep it now, maybe remove it someday in the future. :)

oldStatuses := make(map[string]api.ContainerStatus, len(manifest.Containers))
for _, status := range pod.Status.ContainerStatuses {
oldStatuses[status.Name] = status
}
for _, container := range manifest.Containers {
if containerStatus, found := statuses[container.Name]; found {
reasonInfo, ok := dm.reasonCache.Get(uid, container.Name)
if ok && reasonInfo.reason == kubecontainer.ErrCrashLoopBackOff.Error() {
// We need to increment the restart count if we are going to
// move the current state to last terminated state.
if containerStatus.State.Terminated != nil {
lastObservedTime, ok := lastObservedTime[container.Name]
if !ok || containerStatus.State.Terminated.FinishedAt.After(lastObservedTime.Time) {
containerStatus.RestartCount += 1
}
}
containerStatus.LastTerminationState = containerStatus.State
containerStatus.State = api.ContainerState{
Waiting: &api.ContainerStateWaiting{
@@ -690,7 +682,8 @@ func (dm *DockerManager) runContainer(
netMode string,
ipcMode string,
utsMode string,
pidMode string) (kubecontainer.ContainerID, error) {
pidMode string,
labels map[string]string) (kubecontainer.ContainerID, error) {

dockerName := KubeletContainerName{
PodFullName: kubecontainer.GetPodFullName(pod),
@@ -712,9 +705,11 @@
// termination information like the termination grace period and the pre stop hooks.
// TODO: keep these labels up to date if the pod changes
namespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}
labels := map[string]string{
kubernetesNameLabel: namespacedName.String(),
// Callers that do not need labels may pass nil; an empty map will be created here just in case.
if labels == nil {
labels = map[string]string{}
}
labels[kubernetesNameLabel] = namespacedName.String()
if pod.Spec.TerminationGracePeriodSeconds != nil {
labels[kubernetesTerminationGracePeriodLabel] = strconv.FormatInt(*pod.Spec.TerminationGracePeriodSeconds, 10)
}
@@ -1500,7 +1495,9 @@ func containerAndPodFromLabels(inspect *docker.Container) (pod *api.Pod, contain
}

// Run a single container from a pod. Returns the docker container ID
func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Container, netMode, ipcMode, pidMode string) (kubecontainer.ContainerID, error) {
// If there is no need to pass labels, just pass nil.
// TODO(random-liu): We just add labels directly for now; maybe we should add some abstraction later.
func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Container, netMode, ipcMode, pidMode string, labels map[string]string) (kubecontainer.ContainerID, error) {
start := time.Now()
defer func() {
metrics.ContainerManagerLatency.WithLabelValues("runContainerInPod").Observe(metrics.SinceInMicroseconds(start))
@@ -1520,7 +1517,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe
if pod.Spec.SecurityContext != nil && pod.Spec.SecurityContext.HostNetwork {
utsMode = "host"
}
id, err := dm.runContainer(pod, container, opts, ref, netMode, ipcMode, utsMode, pidMode)
id, err := dm.runContainer(pod, container, opts, ref, netMode, ipcMode, utsMode, pidMode, labels)
if err != nil {
return kubecontainer.ContainerID{}, err
}
@@ -1657,7 +1654,8 @@ func (dm *DockerManager) createPodInfraContainer(pod *api.Pod) (kubetypes.Docker
return "", err
}

id, err := dm.runContainerInPod(pod, container, netNamespace, getIPCMode(pod), getPidMode(pod))
// There are no meaningful labels for the infra container now, so just pass nil.
id, err := dm.runContainerInPod(pod, container, netNamespace, getIPCMode(pod), getPidMode(pod), nil)
if err != nil {
return "", err
}
@@ -1920,13 +1918,28 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod
}
}

labels := map[string]string{}
containerStatuses := podStatus.ContainerStatuses
// podStatus is generated by GetPodStatus(). In GetPodStatus(), we make sure that ContainerStatuses
// contains the statuses of all containers in pod.Spec.Containers.
// Since the containers to start are a subset of pod.Spec.Containers, we should always find a result here.
// For a new container, the RestartCount should be 0.
labels[kubernetesContainerRestartCountLabel] = "0"
for _, containerStatus := range containerStatuses {
// If the container's terminated state or last termination state is not empty, it has exited before. Increment the restart count.
if containerStatus.Name == container.Name && (containerStatus.State.Terminated != nil || containerStatus.LastTerminationState.Terminated != nil) {
labels[kubernetesContainerRestartCountLabel] = strconv.Itoa(containerStatus.RestartCount + 1)
break
}
}

// TODO(dawnchen): Check RestartPolicy.DelaySeconds before restart a container
// Note: when configuring the pod's containers anything that can be configured by pointing
// to the namespace of the infra container should use namespaceMode. This includes things like the net namespace
// and IPC namespace. PID mode cannot point to another container right now.
// See createPodInfraContainer for infra container setup.
namespaceMode := fmt.Sprintf("container:%v", podInfraContainerID)
_, err = dm.runContainerInPod(pod, container, namespaceMode, namespaceMode, getPidMode(pod))
_, err = dm.runContainerInPod(pod, container, namespaceMode, namespaceMode, getPidMode(pod), labels)
dm.updateReasonCache(pod, container, kubecontainer.ErrRunContainer.Error(), err)
if err != nil {
// TODO(bburns) : Perhaps blacklist a container after N failures?
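To summarize the mechanism described in the GetPodStatus comment above, here is a standalone, hedged sketch (not code from this PR) of the restart-count label round-trip: the label is written with the incremented count when a replacement container is started, and parsed back with strconv.Atoi when a container is inspected, falling back to 0 on missing or malformed labels. The helper names and simplified signatures are illustrative assumptions; only the label key matches the kubernetesContainerRestartCountLabel constant added in manager.go.

package main

import (
	"fmt"
	"strconv"
)

// The label key matches kubernetesContainerRestartCountLabel added in manager.go.
const containerRestartCountLabel = "io.kubernetes.container.restartCount"

// labelsForNewContainer builds the labels for a container that is about to start.
// previousRestartCount is the count read from the latest dead container's status;
// hasDied reports whether an earlier instance of this container terminated.
func labelsForNewContainer(previousRestartCount int, hasDied bool) map[string]string {
	labels := map[string]string{containerRestartCountLabel: "0"}
	if hasDied {
		labels[containerRestartCountLabel] = strconv.Itoa(previousRestartCount + 1)
	}
	return labels
}

// restartCountFromLabels recovers the count when inspecting a container.
// Missing or malformed labels fall back to 0, mirroring the best-effort behavior above.
func restartCountFromLabels(labels map[string]string) int {
	if s, ok := labels[containerRestartCountLabel]; ok {
		if n, err := strconv.Atoi(s); err == nil {
			return n
		}
	}
	return 0
}

func main() {
	labels := labelsForNewContainer(2, true)
	fmt.Println(labels)                         // map[io.kubernetes.container.restartCount:3]
	fmt.Println(restartCountFromLabels(labels)) // 3
}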