Merge pull request #118480 from carlory/gc_metrics
podgc metrics should count all pod deletion behaviors
k8s-ci-robot committed Jul 13, 2023
2 parents 2d6d2b9 + 322da7c commit cd92159
Showing 3 changed files with 55 additions and 25 deletions.
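Taken together, the change moves the pod GC counters into a dedicated metrics package and gives both of them namespace and reason labels, so each deletion path the controller takes is counted separately. Below is a minimal sketch of how the relabeled counters are exercised, using a hypothetical recordPodGCDeletion helper; the real code increments the counters inline in each GC path, as the diff shows.

package podgc

import (
	v1 "k8s.io/api/core/v1"

	"k8s.io/kubernetes/pkg/controller/podgc/metrics"
)

// recordPodGCDeletion is an illustrative helper, not part of the commit: it
// bumps the total counter for every attempted force-deletion and the error
// counter only when the deletion fails, both labeled with the pod's namespace
// and a reason such as metrics.PodGCReasonOrphaned.
func recordPodGCDeletion(pod *v1.Pod, reason string, deleteErr error) {
	metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, reason).Inc()
	if deleteErr != nil {
		metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, reason).Inc()
	}
}
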
18 changes: 11 additions & 7 deletions pkg/controller/podgc/gc_controller.go
@@ -37,6 +37,7 @@ import (
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/controller/podgc/metrics"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/eviction"
nodeutil "k8s.io/kubernetes/pkg/util/node"
@@ -69,11 +70,6 @@ type PodGCController struct {
quarantineTime time.Duration
}

func init() {
// Register prometheus metrics
RegisterMetrics()
}

func NewPodGC(ctx context.Context, kubeClient clientset.Interface, podInformer coreinformers.PodInformer,
nodeInformer coreinformers.NodeInformer, terminatedPodThreshold int) *PodGCController {
return NewPodGCInternal(ctx, kubeClient, podInformer, nodeInformer, terminatedPodThreshold, gcCheckPeriod, quarantineTime)
@@ -94,6 +90,8 @@ func NewPodGCInternal(ctx context.Context, kubeClient clientset.Interface, podIn
quarantineTime: quarantineTime,
}

// Register prometheus metrics
metrics.RegisterMetrics()
return gcc
}

@@ -182,11 +180,11 @@ func (gcc *PodGCController) gcTerminating(ctx context.Context, pods []*v1.Pod) {
wait.Add(1)
go func(pod *v1.Pod) {
defer wait.Done()
deletingPodsTotal.WithLabelValues().Inc()
metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc()
if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
// ignore not founds
utilruntime.HandleError(err)
deletingPodsErrorTotal.WithLabelValues().Inc()
metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc()
}
}(terminatingPods[i])
}
@@ -220,7 +218,9 @@ func (gcc *PodGCController) gcTerminated(ctx context.Context, pods []*v1.Pod) {
if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
// ignore not founds
defer utilruntime.HandleError(err)
metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc()
}
metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc()
}(terminatedPods[i])
}
wait.Wait()
@@ -259,9 +259,11 @@ func (gcc *PodGCController) gcOrphaned(ctx context.Context, pods []*v1.Pod, node
WithLastTransitionTime(metav1.Now())
if err := gcc.markFailedAndDeletePodWithCondition(ctx, pod, condition); err != nil {
utilruntime.HandleError(err)
metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc()
} else {
logger.Info("Forced deletion of orphaned Pod succeeded", "pod", klog.KObj(pod))
}
metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc()
}
}

@@ -309,9 +311,11 @@ func (gcc *PodGCController) gcUnscheduledTerminating(ctx context.Context, pods [
logger.V(2).Info("Found unscheduled terminating Pod not assigned to any Node, deleting", "pod", klog.KObj(pod))
if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil {
utilruntime.HandleError(err)
metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc()
} else {
logger.Info("Forced deletion of unscheduled terminating Pod succeeded", "pod", klog.KObj(pod))
}
metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc()
}
}

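Note that metric registration also moves out of an init() function and into NewPodGCInternal, where it goes through metrics.RegisterMetrics() and its sync.Once guard. The standalone sketch below (not the controller's actual code) shows why that pattern makes repeated construction safe: MustRegister runs at most once per process, so a second call cannot panic with a duplicate-registration error.

package main

import (
	"sync"

	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

var (
	registerOnce sync.Once
	exampleTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem: "example",
			Name:      "deletions_total",
			Help:      "Illustrative counter, not a real Kubernetes metric.",
		},
		[]string{"namespace", "reason"},
	)
)

// registerExampleMetrics mirrors the register-once pattern used by the podgc
// metrics package: safe to call from every constructor.
func registerExampleMetrics() {
	registerOnce.Do(func() {
		legacyregistry.MustRegister(exampleTotal)
	})
}

func main() {
	registerExampleMetrics()
	registerExampleMetrics() // second call is a no-op
	exampleTotal.WithLabelValues("default", "terminated").Inc()
}
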
33 changes: 22 additions & 11 deletions pkg/controller/podgc/gc_controller_test.go
@@ -38,6 +38,7 @@ import (
metricstestutil "k8s.io/component-base/metrics/testutil"
"k8s.io/klog/v2/ktesting"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/controller/podgc/metrics"
"k8s.io/kubernetes/pkg/controller/testutil"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/eviction"
@@ -160,7 +161,7 @@ func TestGCTerminated(t *testing.T) {
for _, pod := range test.pods {
creationTime = creationTime.Add(1 * time.Hour)
pods = append(pods, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime}},
ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime}},
Status: v1.PodStatus{Phase: pod.phase, Reason: pod.reason},
Spec: v1.PodSpec{NodeName: "node"},
})
@@ -176,12 +177,16 @@
verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
})
}

// DeletingPodsTotal is expected to be 9 in this test
testDeletingPodsMetrics(t, 9, metrics.PodGCReasonTerminated)
}

func makePod(name string, nodeName string, phase v1.PodPhase) *v1.Pod {
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Name: name,
Namespace: metav1.NamespaceDefault,
},
Spec: v1.PodSpec{NodeName: nodeName},
Status: v1.PodStatus{Phase: phase},
@@ -408,6 +413,9 @@ func TestGCOrphaned(t *testing.T) {
verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
})
}

// DeletingPodsTotal is expected to be 10 in this test
testDeletingPodsMetrics(t, 10, metrics.PodGCReasonOrphaned)
}

func TestGCUnscheduledTerminating(t *testing.T) {
@@ -466,7 +474,7 @@ func TestGCUnscheduledTerminating(t *testing.T) {
for _, pod := range test.pods {
creationTime = creationTime.Add(1 * time.Hour)
pods = append(pods, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime},
ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime},
DeletionTimestamp: pod.deletionTimeStamp},
Status: v1.PodStatus{Phase: pod.phase},
Spec: v1.PodSpec{NodeName: pod.nodeName},
@@ -489,6 +497,9 @@
verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
})
}

// DeletingPodsTotal is expected to be 6 in this test
testDeletingPodsMetrics(t, 6, metrics.PodGCReasonTerminatingUnscheduled)
}

func TestGCTerminating(t *testing.T) {
@@ -637,7 +648,7 @@ func TestGCTerminating(t *testing.T) {
for _, pod := range test.pods {
creationTime = creationTime.Add(1 * time.Hour)
pods = append(pods, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime},
ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime},
DeletionTimestamp: pod.deletionTimeStamp},
Status: v1.PodStatus{Phase: pod.phase},
Spec: v1.PodSpec{NodeName: pod.nodeName},
@@ -657,8 +668,8 @@
verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames)
})
}
// deletingPodsTotal is 7 in this test
testDeletingPodsMetrics(t, 7)
// DeletingPodsTotal is expected to be 7 in this test
testDeletingPodsMetrics(t, 7, metrics.PodGCReasonTerminatingOutOfService)
}

func verifyDeletedAndPatchedPods(t *testing.T, client *fake.Clientset, wantDeletedPodNames, wantPatchedPodNames sets.String) {
@@ -673,18 +684,18 @@ func verifyDeletedAndPatchedPods(t *testing.T, client *fake.Clientset, wantDelet
}
}

func testDeletingPodsMetrics(t *testing.T, inputDeletingPodsTotal int) {
func testDeletingPodsMetrics(t *testing.T, total int, reason string) {
t.Helper()

actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsTotal.WithLabelValues())
actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsTotal.WithLabelValues(metav1.NamespaceDefault, reason))
if err != nil {
t.Errorf("Error getting actualDeletingPodsTotal")
}
if actualDeletingPodsTotal != float64(inputDeletingPodsTotal) {
t.Errorf("Expected desiredDeletingPodsTotal to be %d, got %v", inputDeletingPodsTotal, actualDeletingPodsTotal)
if actualDeletingPodsTotal != float64(total) {
t.Errorf("Expected desiredDeletingPodsTotal to be %d, got %v", total, actualDeletingPodsTotal)
}

actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsErrorTotal.WithLabelValues())
actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsErrorTotal.WithLabelValues("", reason))
if err != nil {
t.Errorf("Error getting actualDeletingPodsErrorTotal")
}
29 changes: 22 additions & 7 deletions pkg/controller/podgc/metrics/metrics.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package podgc
package metrics

import (
"sync"
@@ -28,32 +28,47 @@
)

var (
deletingPodsTotal = metrics.NewCounterVec(
DeletingPodsTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: podGCController,
Name: "force_delete_pods_total",
Help: "Number of pods that are being forcefully deleted since the Pod GC Controller started.",
StabilityLevel: metrics.ALPHA,
},
[]string{},
[]string{"namespace", "reason"},
)
deletingPodsErrorTotal = metrics.NewCounterVec(
DeletingPodsErrorTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: podGCController,
Name: "force_delete_pod_errors_total",
Help: "Number of errors encountered when forcefully deleting the pods since the Pod GC Controller started.",
StabilityLevel: metrics.ALPHA,
},
[]string{},
[]string{"namespace", "reason"},
)
)

const (
// Possible values for the "reason" label in the above metrics.

// PodGCReasonTerminated is used when the pod is terminated.
PodGCReasonTerminated = "terminated"
// PodGCReasonTerminatingOutOfService is used when the pod is terminating and the corresponding node
// is not ready and has the `node.kubernetes.io/out-of-service` taint.
PodGCReasonTerminatingOutOfService = "out-of-service"
// PodGCReasonOrphaned is used when the pod is orphaned, which means the corresponding node
// has been deleted.
PodGCReasonOrphaned = "orphaned"
// PodGCReasonTerminatingUnscheduled is used when the pod is terminating and unscheduled.
PodGCReasonTerminatingUnscheduled = "unscheduled"
)

var registerMetrics sync.Once

// Register the metrics that are to be monitored.
func RegisterMetrics() {
registerMetrics.Do(func() {
legacyregistry.MustRegister(deletingPodsTotal)
legacyregistry.MustRegister(deletingPodsErrorTotal)
legacyregistry.MustRegister(DeletingPodsTotal)
legacyregistry.MustRegister(DeletingPodsErrorTotal)
})
}
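
With the counters exported from the new metrics package, an individual labeled series can be read the same way the updated testDeletingPodsMetrics helper does. A hedged usage sketch follows; the test name, namespace, and expected value are illustrative only.

package metrics_test

import (
	"testing"

	metricstestutil "k8s.io/component-base/metrics/testutil"
	"k8s.io/kubernetes/pkg/controller/podgc/metrics"
)

func TestReadDeletingPodsTotal(t *testing.T) {
	// Registration is idempotent thanks to sync.Once, so calling it here is safe.
	metrics.RegisterMetrics()
	metrics.DeletingPodsTotal.WithLabelValues("default", metrics.PodGCReasonOrphaned).Inc()

	got, err := metricstestutil.GetCounterMetricValue(
		metrics.DeletingPodsTotal.WithLabelValues("default", metrics.PodGCReasonOrphaned))
	if err != nil {
		t.Fatalf("reading DeletingPodsTotal: %v", err)
	}
	if got < 1 {
		t.Errorf("expected DeletingPodsTotal to be at least 1, got %v", got)
	}
}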
