diff --git a/pkg/controller/podgc/gc_controller.go b/pkg/controller/podgc/gc_controller.go index 8e6d6cc5b1e0..eba8edb1070c 100644 --- a/pkg/controller/podgc/gc_controller.go +++ b/pkg/controller/podgc/gc_controller.go @@ -37,6 +37,7 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/controller/podgc/metrics" "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/eviction" nodeutil "k8s.io/kubernetes/pkg/util/node" @@ -69,11 +70,6 @@ type PodGCController struct { quarantineTime time.Duration } -func init() { - // Register prometheus metrics - RegisterMetrics() -} - func NewPodGC(ctx context.Context, kubeClient clientset.Interface, podInformer coreinformers.PodInformer, nodeInformer coreinformers.NodeInformer, terminatedPodThreshold int) *PodGCController { return NewPodGCInternal(ctx, kubeClient, podInformer, nodeInformer, terminatedPodThreshold, gcCheckPeriod, quarantineTime) @@ -94,6 +90,8 @@ func NewPodGCInternal(ctx context.Context, kubeClient clientset.Interface, podIn quarantineTime: quarantineTime, } + // Register prometheus metrics + metrics.RegisterMetrics() return gcc } @@ -182,11 +180,11 @@ func (gcc *PodGCController) gcTerminating(ctx context.Context, pods []*v1.Pod) { wait.Add(1) go func(pod *v1.Pod) { defer wait.Done() - deletingPodsTotal.WithLabelValues().Inc() + metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc() if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil { // ignore not founds utilruntime.HandleError(err) - deletingPodsErrorTotal.WithLabelValues().Inc() + metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingOutOfService).Inc() } }(terminatingPods[i]) } @@ -220,7 +218,9 @@ func (gcc *PodGCController) gcTerminated(ctx context.Context, pods []*v1.Pod) { if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil { // ignore not founds defer utilruntime.HandleError(err) 
+ metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc() } + metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminated).Inc() }(terminatedPods[i]) } wait.Wait() @@ -259,9 +259,11 @@ func (gcc *PodGCController) gcOrphaned(ctx context.Context, pods []*v1.Pod, node WithLastTransitionTime(metav1.Now()) if err := gcc.markFailedAndDeletePodWithCondition(ctx, pod, condition); err != nil { utilruntime.HandleError(err) + metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc() } else { logger.Info("Forced deletion of orphaned Pod succeeded", "pod", klog.KObj(pod)) } + metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonOrphaned).Inc() } } @@ -309,9 +311,11 @@ func (gcc *PodGCController) gcUnscheduledTerminating(ctx context.Context, pods [ logger.V(2).Info("Found unscheduled terminating Pod not assigned to any Node, deleting", "pod", klog.KObj(pod)) if err := gcc.markFailedAndDeletePod(ctx, pod); err != nil { utilruntime.HandleError(err) + metrics.DeletingPodsErrorTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc() } else { logger.Info("Forced deletion of unscheduled terminating Pod succeeded", "pod", klog.KObj(pod)) } + metrics.DeletingPodsTotal.WithLabelValues(pod.Namespace, metrics.PodGCReasonTerminatingUnscheduled).Inc() } } diff --git a/pkg/controller/podgc/gc_controller_test.go b/pkg/controller/podgc/gc_controller_test.go index b02a21ae3b57..9f1a2c57cfdd 100644 --- a/pkg/controller/podgc/gc_controller_test.go +++ b/pkg/controller/podgc/gc_controller_test.go @@ -38,6 +38,7 @@ import ( metricstestutil "k8s.io/component-base/metrics/testutil" "k8s.io/klog/v2/ktesting" "k8s.io/kubernetes/pkg/controller" + "k8s.io/kubernetes/pkg/controller/podgc/metrics" "k8s.io/kubernetes/pkg/controller/testutil" "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/eviction" @@ -160,7 +161,7 @@ func 
TestGCTerminated(t *testing.T) { for _, pod := range test.pods { creationTime = creationTime.Add(1 * time.Hour) pods = append(pods, &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime}}, + ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime}}, Status: v1.PodStatus{Phase: pod.phase, Reason: pod.reason}, Spec: v1.PodSpec{NodeName: "node"}, }) @@ -176,12 +177,16 @@ func TestGCTerminated(t *testing.T) { verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames) }) } + + // deletingPodsTotal is 9 in this test + testDeletingPodsMetrics(t, 9, metrics.PodGCReasonTerminated) } func makePod(name string, nodeName string, phase v1.PodPhase) *v1.Pod { return &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ - Name: name, + Name: name, + Namespace: metav1.NamespaceDefault, }, Spec: v1.PodSpec{NodeName: nodeName}, Status: v1.PodStatus{Phase: phase}, } @@ -408,6 +413,9 @@ func TestGCOrphaned(t *testing.T) { verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames) }) } + + // deletingPodsTotal is 10 in this test + testDeletingPodsMetrics(t, 10, metrics.PodGCReasonOrphaned) } func TestGCUnscheduledTerminating(t *testing.T) { @@ -466,7 +474,7 @@ func TestGCUnscheduledTerminating(t *testing.T) { for _, pod := range test.pods { creationTime = creationTime.Add(1 * time.Hour) pods = append(pods, &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime}, + ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime}, DeletionTimestamp: pod.deletionTimeStamp}, Status: v1.PodStatus{Phase: pod.phase}, Spec: v1.PodSpec{NodeName: pod.nodeName}, @@ -489,6 +497,9 @@ func TestGCUnscheduledTerminating(t *testing.T) { verifyDeletedAndPatchedPods(t, client, test.deletedPodNames,
test.patchedPodNames) }) } + + // deletingPodsTotal is 6 in this test + testDeletingPodsMetrics(t, 6, metrics.PodGCReasonTerminatingUnscheduled) } func TestGCTerminating(t *testing.T) { @@ -637,7 +648,7 @@ func TestGCTerminating(t *testing.T) { for _, pod := range test.pods { creationTime = creationTime.Add(1 * time.Hour) pods = append(pods, &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: pod.name, CreationTimestamp: metav1.Time{Time: creationTime}, + ObjectMeta: metav1.ObjectMeta{Name: pod.name, Namespace: metav1.NamespaceDefault, CreationTimestamp: metav1.Time{Time: creationTime}, DeletionTimestamp: pod.deletionTimeStamp}, Status: v1.PodStatus{Phase: pod.phase}, Spec: v1.PodSpec{NodeName: pod.nodeName}, @@ -657,8 +668,8 @@ func TestGCTerminating(t *testing.T) { verifyDeletedAndPatchedPods(t, client, test.deletedPodNames, test.patchedPodNames) }) } - // deletingPodsTotal is 7 in this test - testDeletingPodsMetrics(t, 7) + // deletingPodsTotal is 7 in this test + testDeletingPodsMetrics(t, 7, metrics.PodGCReasonTerminatingOutOfService) } func verifyDeletedAndPatchedPods(t *testing.T, client *fake.Clientset, wantDeletedPodNames, wantPatchedPodNames sets.String) { @@ -673,18 +684,18 @@ func verifyDeletedAndPatchedPods(t *testing.T, client *fake.Clientset, wantDelet } } -func testDeletingPodsMetrics(t *testing.T, inputDeletingPodsTotal int) { +func testDeletingPodsMetrics(t *testing.T, total int, reason string) { t.Helper() - actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsTotal.WithLabelValues()) + actualDeletingPodsTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsTotal.WithLabelValues(metav1.NamespaceDefault, reason)) if err != nil { t.Errorf("Error getting actualDeletingPodsTotal") } - if actualDeletingPodsTotal != float64(inputDeletingPodsTotal) { - t.Errorf("Expected desiredDeletingPodsTotal to be %d, got %v", inputDeletingPodsTotal, actualDeletingPodsTotal) + if actualDeletingPodsTotal != 
float64(total) { + t.Errorf("Expected desiredDeletingPodsTotal to be %d, got %v", total, actualDeletingPodsTotal) } - actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(deletingPodsErrorTotal.WithLabelValues()) + actualDeletingPodsErrorTotal, err := metricstestutil.GetCounterMetricValue(metrics.DeletingPodsErrorTotal.WithLabelValues("", reason)) if err != nil { t.Errorf("Error getting actualDeletingPodsErrorTotal") } diff --git a/pkg/controller/podgc/metrics.go b/pkg/controller/podgc/metrics/metrics.go similarity index 59% rename from pkg/controller/podgc/metrics.go rename to pkg/controller/podgc/metrics/metrics.go index e0c742a81cb8..369c386eb2c4 100644 --- a/pkg/controller/podgc/metrics.go +++ b/pkg/controller/podgc/metrics/metrics.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package podgc +package metrics import ( "sync" @@ -28,32 +28,47 @@ const ( ) var ( - deletingPodsTotal = metrics.NewCounterVec( + DeletingPodsTotal = metrics.NewCounterVec( &metrics.CounterOpts{ Subsystem: podGCController, Name: "force_delete_pods_total", Help: "Number of pods that are being forcefully deleted since the Pod GC Controller started.", StabilityLevel: metrics.ALPHA, }, - []string{}, + []string{"namespace", "reason"}, ) - deletingPodsErrorTotal = metrics.NewCounterVec( + DeletingPodsErrorTotal = metrics.NewCounterVec( &metrics.CounterOpts{ Subsystem: podGCController, Name: "force_delete_pod_errors_total", Help: "Number of errors encountered when forcefully deleting the pods since the Pod GC Controller started.", StabilityLevel: metrics.ALPHA, }, - []string{}, + []string{"namespace", "reason"}, ) ) +const ( + // Possible values for the "reason" label in the above metrics. + + // PodGCReasonTerminated is used when the pod is terminated. 
+ PodGCReasonTerminated = "terminated" + // PodGCReasonTerminatingOutOfService is used when the pod is terminating and the corresponding node + // is not ready and has `node.kubernetes.io/out-of-service` taint. + PodGCReasonTerminatingOutOfService = "out-of-service" + // PodGCReasonOrphaned is used when the pod is orphaned which means the corresponding node + // has been deleted. + PodGCReasonOrphaned = "orphaned" + // PodGCReasonTerminatingUnscheduled is used when the pod is terminating and unscheduled. + PodGCReasonTerminatingUnscheduled = "unscheduled" +) + var registerMetrics sync.Once // Register the metrics that are to be monitored. func RegisterMetrics() { registerMetrics.Do(func() { - legacyregistry.MustRegister(deletingPodsTotal) - legacyregistry.MustRegister(deletingPodsErrorTotal) + legacyregistry.MustRegister(DeletingPodsTotal) + legacyregistry.MustRegister(DeletingPodsErrorTotal) }) }