From 67e6dde241392a4d31e843320d0bca3d9737ba98 Mon Sep 17 00:00:00 2001 From: Kevin Earls Date: Thu, 1 Dec 2022 10:48:51 +0100 Subject: [PATCH] Ignore reconcile errors that occur because a pod is being terminated (#1233) * Ignore reconcile errors that occur because a pod is being terminated Signed-off-by: Kevin Earls * Appease the all-powerful linter Signed-off-by: Kevin Earls * Change behavior to end reconcile loop if pod has been terminated Signed-off-by: Kevin Earls * Print a log message if we exit the reconciler loop Signed-off-by: Kevin Earls * Look for NamespaceTerminatingCause Signed-off-by: Kevin Earls * Appease the almighty linter Signed-off-by: Kevin Earls * Fix log message Signed-off-by: Kevin Earls * Skip flaky test Signed-off-by: Kevin Earls Signed-off-by: Kevin Earls Co-authored-by: Ben B --- cmd/otel-allocator/allocation/least_weighted_test.go | 1 + controllers/opentelemetrycollector_controller.go | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/cmd/otel-allocator/allocation/least_weighted_test.go b/cmd/otel-allocator/allocation/least_weighted_test.go index 2812541966..f70d5025fb 100644 --- a/cmd/otel-allocator/allocation/least_weighted_test.go +++ b/cmd/otel-allocator/allocation/least_weighted_test.go @@ -181,6 +181,7 @@ func TestNoCollectorReassignment(t *testing.T) { } func TestSmartCollectorReassignment(t *testing.T) { + t.Skip("This test is flaky and fails frequently, see issue 1291") s, _ := New("least-weighted", logger) cols := makeNCollectors(4, 0) diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go index 8e986c4695..6d3bb96a67 100644 --- a/controllers/opentelemetrycollector_controller.go +++ b/controllers/opentelemetrycollector_controller.go @@ -168,6 +168,11 @@ func (r *OpenTelemetryCollectorReconciler) Reconcile(ctx context.Context, req ct func (r *OpenTelemetryCollectorReconciler) RunTasks(ctx context.Context, params reconcile.Params) error { for _, task := range r.tasks 
{ if err := task.Do(ctx, params); err != nil { + // If the error occurred because the namespace is being terminated, stop running tasks and return without error + if apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) { + r.log.V(2).Info("Exiting reconcile loop because namespace is being terminated", "namespace", params.Instance.Namespace) + return nil + } r.log.Error(err, fmt.Sprintf("failed to reconcile %s", task.Name)) if task.BailOnError { return err