Skip to content

Commit

Permalink
Merge pull request #48739 from yguo0905/fix-dup-metrics
Browse files Browse the repository at this point in the history
Automatic merge from submit-queue

Remove the status of the terminated containers in the summary endpoint

Ref: #47853

- When building summary, a container is considered to be terminated if it has an older creation time and no CPU instantaneous or memory RSS usage.
- We remove the terminated containers in the summary by grouping the containers with the same name in the same pod, sorting them in each group by creation time, and skipping the oldest ones with no usage in each group. Let me know if there's simpler way.

**Release note**:
```
None
```
/assign @yujuhong
  • Loading branch information
Kubernetes Submit Queue committed Aug 15, 2017
2 parents 3537f8f + af76564 commit f59b04b
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 10 deletions.
110 changes: 100 additions & 10 deletions pkg/kubelet/server/stats/summary.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package stats

import (
"fmt"
"sort"
"strings"
"time"

Expand Down Expand Up @@ -128,7 +129,7 @@ func (sb *summaryBuilder) build() (*stats.Summary, error) {
}

rootStats := sb.containerInfoV2ToStats("", &rootInfo)
cStats, _ := sb.latestContainerStats(&rootInfo)
cStats, _ := latestContainerStats(&rootInfo)
nodeStats := stats.NodeStats{
NodeName: sb.node.Name,
CPU: rootStats.CPU,
Expand Down Expand Up @@ -184,7 +185,7 @@ func (sb *summaryBuilder) containerInfoV2FsStats(
info *cadvisorapiv2.ContainerInfo,
cs *stats.ContainerStats) {

lcs, found := sb.latestContainerStats(info)
lcs, found := latestContainerStats(info)
if !found {
return
}
Expand Down Expand Up @@ -230,7 +231,7 @@ func (sb *summaryBuilder) containerInfoV2FsStats(
}

// latestContainerStats returns the latest container stats from cadvisor, or nil if none exist
func (sb *summaryBuilder) latestContainerStats(info *cadvisorapiv2.ContainerInfo) (*cadvisorapiv2.ContainerStats, bool) {
func latestContainerStats(info *cadvisorapiv2.ContainerInfo) (*cadvisorapiv2.ContainerStats, bool) {
stats := info.Stats
if len(stats) < 1 {
return nil, false
Expand All @@ -242,23 +243,112 @@ func (sb *summaryBuilder) latestContainerStats(info *cadvisorapiv2.ContainerInfo
return latest, true
}

// hasMemoryAndCPUInstUsage returns true if the specified container info has
// both non-zero CPU instantaneous usage and non-zero memory RSS usage, and
// false otherwise.
func hasMemoryAndCPUInstUsage(info *cadvisorapiv2.ContainerInfo) bool {
if !info.Spec.HasCpu || !info.Spec.HasMemory {
return false
}
cstat, found := latestContainerStats(info)
if !found {
return false
}
if cstat.CpuInst == nil {
return false
}
return cstat.CpuInst.Usage.Total != 0 && cstat.Memory.RSS != 0
}

// ByCreationTime implements sort.Interface for []containerInfoWithCgroup based
// on the cinfo.Spec.CreationTime field.
type ByCreationTime []containerInfoWithCgroup

func (a ByCreationTime) Len() int { return len(a) }
func (a ByCreationTime) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByCreationTime) Less(i, j int) bool {
if a[i].cinfo.Spec.CreationTime.Equal(a[j].cinfo.Spec.CreationTime) {
// There shouldn't be two containers with the same name and/or the same
// creation time. However, to make the logic here robust, we break the
// tie by moving the one without CPU instantaneous or memory RSS usage
// to the beginning.
return hasMemoryAndCPUInstUsage(&a[j].cinfo)
}
return a[i].cinfo.Spec.CreationTime.Before(a[j].cinfo.Spec.CreationTime)
}

// containerID is the identity of a container in a pod.
type containerID struct {
podRef stats.PodReference
containerName string
}

// containerInfoWithCgroup contains the ContainerInfo and its cgroup name.
type containerInfoWithCgroup struct {
cinfo cadvisorapiv2.ContainerInfo
cgroup string
}

// removeTerminatedContainerInfo returns the specified containerInfo but with
// the stats of the terminated containers removed.
//
// A ContainerInfo is considered to be of a terminated container if it has an
// older CreationTime and zero CPU instantaneous and memory RSS usage.
func removeTerminatedContainerInfo(containerInfo map[string]cadvisorapiv2.ContainerInfo) map[string]cadvisorapiv2.ContainerInfo {
cinfoMap := make(map[containerID][]containerInfoWithCgroup)
for key, cinfo := range containerInfo {
if !isPodManagedContainer(&cinfo) {
continue
}
cinfoID := containerID{
podRef: buildPodRef(&cinfo),
containerName: types.GetContainerName(cinfo.Spec.Labels),
}
cinfoMap[cinfoID] = append(cinfoMap[cinfoID], containerInfoWithCgroup{
cinfo: cinfo,
cgroup: key,
})
}
result := make(map[string]cadvisorapiv2.ContainerInfo)
for _, refs := range cinfoMap {
if len(refs) == 1 {
result[refs[0].cgroup] = refs[0].cinfo
continue
}
sort.Sort(ByCreationTime(refs))
i := 0
for ; i < len(refs); i++ {
if hasMemoryAndCPUInstUsage(&refs[i].cinfo) {
// Stops removing when we first see an info with non-zero
// CPU/Memory usage.
break
}
}
for ; i < len(refs); i++ {
result[refs[i].cgroup] = refs[i].cinfo
}
}
return result
}

// buildSummaryPods aggregates and returns the container stats in cinfos by the Pod managing the container.
// Containers not managed by a Pod are omitted.
func (sb *summaryBuilder) buildSummaryPods() []stats.PodStats {
// Map each container to a pod and update the PodStats with container data
podToStats := map[stats.PodReference]*stats.PodStats{}
for key, cinfo := range sb.infos {
infos := removeTerminatedContainerInfo(sb.infos)
for key, cinfo := range infos {
// on systemd using devicemapper each mount into the container has an associated cgroup.
// we ignore them to ensure we do not get duplicate entries in our summary.
// for details on .mount units: http://man7.org/linux/man-pages/man5/systemd.mount.5.html
if strings.HasSuffix(key, ".mount") {
continue
}
// Build the Pod key if this container is managed by a Pod
if !sb.isPodManagedContainer(&cinfo) {
if !isPodManagedContainer(&cinfo) {
continue
}
ref := sb.buildPodRef(&cinfo)
ref := buildPodRef(&cinfo)

// Lookup the PodStats for the pod using the PodRef. If none exists, initialize a new entry.
podStats, found := podToStats[ref]
Expand Down Expand Up @@ -292,15 +382,15 @@ func (sb *summaryBuilder) buildSummaryPods() []stats.PodStats {
}

// buildPodRef returns a PodReference that identifies the Pod managing cinfo
func (sb *summaryBuilder) buildPodRef(cinfo *cadvisorapiv2.ContainerInfo) stats.PodReference {
func buildPodRef(cinfo *cadvisorapiv2.ContainerInfo) stats.PodReference {
podName := types.GetPodName(cinfo.Spec.Labels)
podNamespace := types.GetPodNamespace(cinfo.Spec.Labels)
podUID := types.GetPodUID(cinfo.Spec.Labels)
return stats.PodReference{Name: podName, Namespace: podNamespace, UID: podUID}
}

// isPodManagedContainer returns true if the cinfo container is managed by a Pod
func (sb *summaryBuilder) isPodManagedContainer(cinfo *cadvisorapiv2.ContainerInfo) bool {
func isPodManagedContainer(cinfo *cadvisorapiv2.ContainerInfo) bool {
podName := types.GetPodName(cinfo.Spec.Labels)
podNamespace := types.GetPodNamespace(cinfo.Spec.Labels)
managed := podName != "" && podNamespace != ""
Expand All @@ -319,7 +409,7 @@ func (sb *summaryBuilder) containerInfoV2ToStats(
StartTime: metav1.NewTime(info.Spec.CreationTime),
Name: name,
}
cstat, found := sb.latestContainerStats(info)
cstat, found := latestContainerStats(info)
if !found {
return cStats
}
Expand Down Expand Up @@ -371,7 +461,7 @@ func (sb *summaryBuilder) containerInfoV2ToNetworkStats(name string, info *cadvi
if !info.Spec.HasNetwork {
return nil
}
cstat, found := sb.latestContainerStats(info)
cstat, found := latestContainerStats(info)
if !found {
return nil
}
Expand Down
51 changes: 51 additions & 0 deletions pkg/kubelet/server/stats/summary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,50 @@ var (
creationTime = timestamp.Add(-5 * time.Minute)
)

func TestRemoveTerminatedContainerInfo(t *testing.T) {
const (
seedPastPod0Infra = 1000
seedPastPod0Container0 = 2000
seedPod0Infra = 3000
seedPod0Container0 = 4000
)
const (
namespace = "test"
pName0 = "pod0"
cName00 = "c0"
)
infos := map[string]v2.ContainerInfo{
// ContainerInfo with past creation time and no CPU/memory usage for
// simulating uncleaned cgroups of already terminated containers, which
// should not be shown in the results.
"/pod0-i-terminated-1": summaryTerminatedContainerInfo(seedPastPod0Infra, pName0, namespace, leaky.PodInfraContainerName),
"/pod0-c0-terminated-1": summaryTerminatedContainerInfo(seedPastPod0Container0, pName0, namespace, cName00),

// Same as above but uses the same creation time as the latest
// containers. They are terminated containers, so they should not be in
// the results.
"/pod0-i-terminated-2": summaryTerminatedContainerInfo(seedPod0Infra, pName0, namespace, leaky.PodInfraContainerName),
"/pod0-c0-terminated-2": summaryTerminatedContainerInfo(seedPod0Container0, pName0, namespace, cName00),

// The latest containers, which should be in the results.
"/pod0-i": summaryTestContainerInfo(seedPod0Infra, pName0, namespace, leaky.PodInfraContainerName),
"/pod0-c0": summaryTestContainerInfo(seedPod0Container0, pName0, namespace, cName00),

// Duplicated containers with non-zero CPU and memory usage. This case
// shouldn't happen unless something goes wrong, but we want to test
// that the metrics reporting logic works in this scenario.
"/pod0-i-duplicated": summaryTestContainerInfo(seedPod0Infra, pName0, namespace, leaky.PodInfraContainerName),
"/pod0-c0-duplicated": summaryTestContainerInfo(seedPod0Container0, pName0, namespace, cName00),
}
output := removeTerminatedContainerInfo(infos)
assert.Len(t, output, 4)
for _, c := range []string{"/pod0-i", "/pod0-c0", "/pod0-i-duplicated", "/pod0-c0-duplicated"} {
if _, found := output[c]; !found {
t.Errorf("%q is expected to be in the output\n", c)
}
}
}

func TestBuildSummary(t *testing.T) {
node := k8sv1.Node{}
node.Name = "FooNode"
Expand Down Expand Up @@ -280,6 +324,13 @@ func generateCustomMetrics(spec []v1.MetricSpec) map[string][]v1.MetricVal {
return ret
}

func summaryTerminatedContainerInfo(seed int, podName string, podNamespace string, containerName string) v2.ContainerInfo {
cinfo := summaryTestContainerInfo(seed, podName, podNamespace, containerName)
cinfo.Stats[0].Memory.RSS = 0
cinfo.Stats[0].CpuInst.Usage.Total = 0
return cinfo
}

func summaryTestContainerInfo(seed int, podName string, podNamespace string, containerName string) v2.ContainerInfo {
labels := map[string]string{}
if podName != "" {
Expand Down

0 comments on commit f59b04b

Please sign in to comment.