Add debug logging to eviction manager #41147

Merged
8 changes: 8 additions & 0 deletions hack/local-up-cluster.sh
@@ -41,6 +41,11 @@ CGROUP_ROOT=${CGROUP_ROOT:-""}
# name of the cgroup driver, i.e. cgroupfs or systemd
CGROUP_DRIVER=${CGROUP_DRIVER:-""}

# enables testing eviction scenarios locally.
EVICTION_HARD=${EVICTION_HARD:-"memory.available<100Mi"}
EVICTION_SOFT=${EVICTION_SOFT:-""}
EVICTION_PRESSURE_TRANSITION_PERIOD=${EVICTION_PRESSURE_TRANSITION_PERIOD:-"1m"}

# We disable cluster DNS by default because this script uses docker0 (or whatever
# container bridge docker is currently using) and we don't know the IP of the
# DNS pod to pass in as --cluster-dns. To set this up by hand, set this flag
@@ -545,6 +550,9 @@ function start_kubelet {
--cgroup-driver=${CGROUP_DRIVER} \
--cgroup-root=${CGROUP_ROOT} \
--keep-terminated-pod-volumes=true \
--eviction-hard=${EVICTION_HARD} \
--eviction-soft=${EVICTION_SOFT} \
--eviction-pressure-transition-period=${EVICTION_PRESSURE_TRANSITION_PERIOD} \
${auth_args} \
${dns_args} \
${net_plugin_dir_args} \
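These defaults can be overridden from the environment before invoking the script. An illustrative sketch (the specific values here are examples for local testing, not recommendations):

# Illustrative: raise the hard memory threshold and shorten the
# pressure transition period so evictions trigger sooner on a dev box.
EVICTION_HARD="memory.available<500Mi" \
EVICTION_PRESSURE_TRANSITION_PERIOD="30s" \
hack/local-up-cluster.sh
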
13 changes: 13 additions & 0 deletions pkg/kubelet/eviction/eviction_manager.go
@@ -186,6 +186,8 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
return
}

glog.V(3).Infof("eviction manager: synchronize housekeeping")

// build the ranking functions (if not yet known)
// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
if len(m.resourceToRankFunc) == 0 || len(m.resourceToNodeReclaimFuncs) == 0 {
@@ -204,6 +206,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
glog.Errorf("eviction manager: unexpected err: %v", err)
return
}
debugLogObservations("observations", observations)

// attempt to create a threshold notifier to improve eviction response time
if m.config.KernelMemcgNotification && !m.notifiersInitialized {
@@ -230,31 +233,41 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act

// determine the set of thresholds met independent of grace period
thresholds = thresholdsMet(thresholds, observations, false)
debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)

// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
if len(m.thresholdsMet) > 0 {
thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
}
debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)

// determine the set of thresholds whose stats have been updated since the last sync
thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)

// track when a threshold was first observed
now := m.clock.Now()
thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)

// the set of node conditions that are triggered by currently observed thresholds
nodeConditions := nodeConditions(thresholds)
if len(nodeConditions) > 0 {
glog.V(3).Infof("eviction manager: node conditions - observed: %v", nodeConditions)
}

// track when a node condition was last observed
nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)

// node conditions report true if they have been observed within the transition period window
nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
if len(nodeConditions) > 0 {
glog.V(3).Infof("eviction manager: node conditions - transition period not met: %v", nodeConditions)
}

// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
debugLogThresholdsWithObservation("thresholds - grace periods satisified", thresholds, observations)

// update internal state
m.Lock()
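All of the new messages are gated at glog verbosity level 3, so they only appear when the kubelet is started with --v=3 or higher and cost essentially nothing at default verbosity. A minimal, self-contained sketch of that gating pattern, using the real github.com/golang/glog package (the message text mirrors the line added above):

package main

import (
	"flag"

	"github.com/golang/glog"
)

func main() {
	// glog registers -v (and friends) on the default FlagSet in its
	// init; parsing the flags enables verbosity gating.
	flag.Parse()

	// V(3) returns a Verbose value whose Infof is a no-op unless the
	// process was started with -v=3 or higher.
	glog.V(3).Infof("eviction manager: synchronize housekeeping")
	glog.Flush()
}
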
23 changes: 23 additions & 0 deletions pkg/kubelet/eviction/helpers.go
@@ -694,6 +694,29 @@ func thresholdsMet(thresholds []Threshold, observations signalObservations, enfo
return results
}

func debugLogObservations(logPrefix string, observations signalObservations) {
for k, v := range observations {
if !v.time.IsZero() {
glog.V(3).Infof("eviction manager: %v: signal=%v, available: %v, capacity: %v, time: %v", logPrefix, k, v.available, v.capacity, v.time)
} else {
glog.V(3).Infof("eviction manager: %v: signal=%v, available: %v, capacity: %v", logPrefix, k, v.available, v.capacity)
}
}
}

func debugLogThresholdsWithObservation(logPrefix string, thresholds []Threshold, observations signalObservations) {
for i := range thresholds {
threshold := thresholds[i]
observed, found := observations[threshold.Signal]
if found {
quantity := getThresholdQuantity(threshold.Value, observed.capacity)
glog.V(3).Infof("eviction manager: %v: threshold [signal=%v, quantity=%v] observed %v", logPrefix, threshold.Signal, quantity, observed.available)
} else {
glog.V(3).Infof("eviction manager: %v: threshold [signal=%v] had no observation", logPrefix, threshold.Signal)
}
}
}

func thresholdsUpdatedStats(thresholds []Threshold, observations, lastObservations signalObservations) []Threshold {
results := []Threshold{}
for i := range thresholds {
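The interesting step in debugLogThresholdsWithObservation is getThresholdQuantity, which resolves a threshold to a concrete quantity before logging it: a threshold is expressed either as an absolute quantity or as a percentage of capacity. A hedged, self-contained sketch of that idea — the types below are simplified stand-ins for the kubelet's internal ThresholdValue and resource quantities, not the real definitions:

package main

import "fmt"

type thresholdValue struct {
	quantity   int64   // absolute threshold in bytes; used when percentage is zero
	percentage float64 // fraction of capacity, e.g. 0.05 for 5%
}

// resolveThreshold turns a threshold into a concrete byte count,
// resolving percentages against the observed capacity.
func resolveThreshold(v thresholdValue, capacityBytes int64) int64 {
	if v.percentage > 0 {
		return int64(v.percentage * float64(capacityBytes))
	}
	return v.quantity
}

func main() {
	capacity := int64(4 << 30) // 4Gi of memory

	// An absolute threshold resolves as-is: 100Mi.
	fmt.Println(resolveThreshold(thresholdValue{quantity: 100 << 20}, capacity))

	// A percentage threshold resolves against capacity: 5% of 4Gi.
	fmt.Println(resolveThreshold(thresholdValue{percentage: 0.05}, capacity))
}
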