From 41dfe84b865fc0d1010750277225747193d265d8 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Tue, 7 Mar 2023 14:41:43 -0500 Subject: [PATCH 1/2] [JENKINS-60507] Eventually cancel `node` blocks in anomalous status --- .../support/steps/ExecutorStepExecution.java | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java index eab9dc961..9736572ac 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java @@ -21,6 +21,7 @@ import hudson.model.Job; import hudson.model.Label; import hudson.model.Node; +import hudson.model.PeriodicWork; import hudson.model.Queue; import hudson.model.ResourceList; import hudson.model.Result; @@ -41,12 +42,15 @@ import hudson.slaves.WorkspaceList; import java.io.IOException; import java.io.Serializable; +import java.time.Duration; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; @@ -246,6 +250,49 @@ public void stop(@NonNull Throwable cause) throws Exception { } + /** + * Looks for executions whose {@link #getStatus} would be neither running nor scheduled, and cancels them. + */ + @Extension public static final class AnomalousStatus extends PeriodicWork { + + @Override public long getRecurrencePeriod() { + return Duration.ofHours(1).toMillis(); + } + + @Override protected void doRun() throws Exception { + Set knownTasks = new HashSet<>(); + for (Queue.Item item : Queue.getInstance().getItems()) { + if (item.task instanceof PlaceholderTask) { + knownTasks.add(((PlaceholderTask) item.task).context); + } + } + Jenkins j = Jenkins.getInstanceOrNull(); + if (j != null) { + for (Computer c : j.getComputers()) { + for (Executor e : c.getExecutors()) { + Queue.Executable exec = e.getCurrentExecutable(); + if (exec instanceof PlaceholderTask.PlaceholderExecutable) { + knownTasks.add(((PlaceholderTask.PlaceholderExecutable) exec).getParent().context); + } + } + } + } + StepExecution.applyAll(ExecutorStepExecution.class, exec -> { + StepContext ctx = exec.getContext(); + if (!knownTasks.contains(ctx)) { + try { + ctx.get(TaskListener.class).error("node block appears to be neither running nor scheduled; cancelling"); + } catch (IOException | InterruptedException x) { + LOGGER.log(Level.WARNING, null, x); + } + ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new QueueTaskCancelled())); + } + return null; + }); + } + + } + public static final class QueueTaskCancelled extends CauseOfInterruption { @Override public String getShortDescription() { return Messages.ExecutorStepExecution_queue_task_cancelled(); From 2edabfda859279ca6ff99ea7f77f4f2a5da6502f Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Tue, 7 Mar 2023 16:08:01 -0500 Subject: [PATCH 2/2] More conservatively, cancel only if we see the same task in this status twice in a row --- .../support/steps/ExecutorStepExecution.java | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java index 9736572ac..2fcf15b03 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java @@ -256,13 +256,25 @@ public void stop(@NonNull Throwable cause) throws Exception { @Extension public static final class AnomalousStatus extends PeriodicWork { @Override public long getRecurrencePeriod() { - return Duration.ofHours(1).toMillis(); + return Duration.ofMinutes(30).toMillis(); } + @Override public long getInitialDelay() { + // Do not run too soon after startup, in case things are still loading, agents are still reattaching, etc. + return Duration.ofMinutes(15).toMillis(); + } + + /** + * Tasks considered to be in an anomalous status the last time we ran. + */ + private Set anomalous = Set.of(); + @Override protected void doRun() throws Exception { + LOGGER.fine("checking"); Set knownTasks = new HashSet<>(); for (Queue.Item item : Queue.getInstance().getItems()) { if (item.task instanceof PlaceholderTask) { + LOGGER.fine(() -> "pending " + item); knownTasks.add(((PlaceholderTask) item.task).context); } } @@ -272,23 +284,37 @@ public void stop(@NonNull Throwable cause) throws Exception { for (Executor e : c.getExecutors()) { Queue.Executable exec = e.getCurrentExecutable(); if (exec instanceof PlaceholderTask.PlaceholderExecutable) { + LOGGER.fine(() -> "running " + exec); knownTasks.add(((PlaceholderTask.PlaceholderExecutable) exec).getParent().context); } } } } + Set newAnomalous = new HashSet<>(); StepExecution.applyAll(ExecutorStepExecution.class, exec -> { StepContext ctx = exec.getContext(); if (!knownTasks.contains(ctx)) { - try { - ctx.get(TaskListener.class).error("node block appears to be neither running nor scheduled; cancelling"); - } catch (IOException | InterruptedException x) { - LOGGER.log(Level.WARNING, null, x); + LOGGER.warning(() -> "do not know about " + ctx); + if (anomalous.contains(ctx)) { + try { + ctx.get(TaskListener.class).error("node block still appears to be neither running nor scheduled; cancelling"); + } catch (IOException | InterruptedException x) { + LOGGER.log(Level.WARNING, null, x); + } + ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new QueueTaskCancelled())); + } else { + newAnomalous.add(ctx); } - ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new QueueTaskCancelled())); + } else { + LOGGER.fine(() -> "know about " + ctx); } return null; - }); + }).get(); + for (StepContext ctx : newAnomalous) { + ctx.get(TaskListener.class).error("node block appears to be neither running nor scheduled; will cancel if this condition persists"); + } + LOGGER.fine(() -> "done checking: " + anomalous + " → " + newAnomalous); + anomalous = newAnomalous; } }