From 2338f882fed64546afecc38c00e472a36c5fde94 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 17 Apr 2018 19:37:13 -0400 Subject: [PATCH 01/34] Ensure FlowHead always has persisted FlowNodes before mutating it, persist at shutdown after threads idle to avoid concurrentModification --- .../workflow/cps/CpsBodyExecution.java | 1 - .../workflow/cps/CpsFlowExecution.java | 20 ++++++++++++------- .../plugins/workflow/cps/CpsStepContext.java | 1 - .../plugins/workflow/cps/FlowHead.java | 19 +++++++++++++----- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsBodyExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsBodyExecution.java index 6a61e0f29..0fabfcad3 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsBodyExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsBodyExecution.java @@ -337,7 +337,6 @@ public Next receive(Object o) { FlowHead h = CpsThread.current().head; StepStartNode ssn = addBodyStartFlowNode(h); h.setNewHead(ssn); - CpsFlowExecution.maybeAutoPersistNode(ssn); } StepEndNode en = addBodyEndFlowNode(); diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index b34366d14..f5c41113e 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -134,7 +134,6 @@ import java.util.Set; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeUnit; -import java.util.stream.Collector; import java.util.stream.Collectors; import javax.annotation.CheckForNull; import javax.annotation.Nonnull; @@ -1482,7 +1481,7 @@ public void pause(final boolean v) throws IOException { @Override public void onSuccess(CpsThreadGroup g) { if (v) { g.pause(); - checkAndAbortNonresumableBuild(); + checkAndAbortNonresumableBuild(); // TODO Verify if we can rely on just killing paused builds at shutdown via checkAndAbortNonresumableBuild() checkpoint(); } else { g.unpause(); @@ -1510,12 +1509,16 @@ public void pause(final boolean v) throws IOException { LOGGER.fine("starting to suspend all executions"); for (FlowExecution execution : FlowExecutionList.get()) { if (execution instanceof CpsFlowExecution) { + CpsFlowExecution cpsExec = (CpsFlowExecution)execution; + cpsExec.checkAndAbortNonresumableBuild(); + LOGGER.log(Level.FINE, "waiting to suspend {0}", execution); exec = (CpsFlowExecution) execution; // Like waitForSuspension but with a timeout: if (exec.programPromise != null) { exec.programPromise.get(1, TimeUnit.MINUTES).scheduleRun().get(1, TimeUnit.MINUTES); } + cpsExec.checkpoint(); } } LOGGER.fine("finished suspending all executions"); @@ -1858,19 +1861,23 @@ public void onFailure(Throwable t) { persistOk = false; LOGGER.log(Level.FINE, "Error saving program, that should be handled elsewhere.", ex); } + try { // Flush node storage just in case the Program mutated it, just to be sure + storage.flush(); + } catch (IOException ioe) { + persistOk=false; + LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before shutdown", ioe); + } persistedClean = persistOk; saveOwner(); } } - /** Clean shutdown of build. */ + /** Abort any running builds at Jenkins shutdown if they don't support resuming at next startup. */ private void checkAndAbortNonresumableBuild() { if (isComplete() || this.getDurabilityHint().isPersistWithEveryStep() || !isResumeBlocked()) { return; } try { - // FIXME we need to actually kill the darn build - owner.getListener().getLogger().println("Failing build: shutting down master and build is marked to not resume"); final Throwable x = new FlowInterruptedException(Result.ABORTED); Futures.addCallback(this.getCurrentExecutions(/* cf. JENKINS-26148 */true), new FutureCallback>() { @@ -1901,8 +1908,7 @@ private void checkAndAbortNonresumableBuild() { /** Ensures that even if we're limiting persistence of data for performance, we still write out data for shutdown. */ @Override protected void notifyShutdown() { - checkAndAbortNonresumableBuild(); - checkpoint(); + // No-op, handled in the suspendAll terminator } } diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsStepContext.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsStepContext.java index d1195c358..2b90bec1b 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsStepContext.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsStepContext.java @@ -436,7 +436,6 @@ public void onSuccess(CpsThreadGroup g) { g.getExecution().subsumeHead(parents.get(i)); StepEndNode en = new StepEndNode(flow, (StepStartNode) n, parents); thread.head.setNewHead(en); - CpsFlowExecution.maybeAutoPersistNode(en); } thread.head.markIfFail(getOutcome()); thread.setStep(null); diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java index b9f7f7e57..1726cbcf5 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java @@ -124,11 +124,14 @@ void setNewHead(@Nonnull FlowNode v) { try { if (this.head != null) { CpsFlowExecution.maybeAutoPersistNode(head); + assert execution.storage.getNode(this.head.getId()) != null; } execution.storage.storeNode(v, true); + assert execution.storage.getNode(v.getId()) != null; v.addAction(new TimingAction()); - } catch (IOException e) { - LOGGER.log(Level.FINE, "Failed to record new head: " + v, e); + CpsFlowExecution.maybeAutoPersistNode(v); // Persist node before changing head, otherwise Program can have unpersisted nodes. + } catch (Exception e) { + LOGGER.log(Level.FINE, "Failed to record new head or persist old: " + v, e); } this.head = v; CpsThreadGroup c = CpsThreadGroup.current(); @@ -179,10 +182,16 @@ private void readObject(ObjectInputStream ois) throws IOException, ClassNotFound private Object readResolve() { execution = CpsFlowExecution.PROGRAM_STATE_SERIALIZATION.get(); - if (execution!=null) - return execution.getFlowHead(id); - else + if (execution!=null) { + // See if parallel loading? + FlowHead myHead = execution.getFlowHead(id); + if (myHead == null) { + LOGGER.log(Level.WARNING, "FlowHead loading problem at deserialize: Null FlowHead with id "+id+" in execution "+execution); + } + return myHead; + } else { return this; + } } private static final Logger LOGGER = Logger.getLogger(FlowHead.class.getName()); From ee259e9743d81a5fe4399de25bb2d410c63da322 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 17 Apr 2018 20:32:02 -0400 Subject: [PATCH 02/34] Extended fuzzing test coverage --- .../workflow/cps/FlowDurabilityTest.java | 64 ++++++++++++++++++- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java index dd5e95cd2..53af4e3c8 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java @@ -839,7 +839,7 @@ private WorkflowRun runFuzzerJob(JenkinsRule jrule, String jobName, FlowDurabili * May fail rarely due to files being copied in a different order than they are modified as part of simulating a dirty restart. * See {@link RestartableJenkinsRule#simulateAbruptShutdown()} for why that copying happens. */ @Test - @Ignore //Too long to run as part of main suite + //Too long to run as part of main suite @TimedRepeatRule.RepeatForTime(repeatMillis = 150_000) public void fuzzTimingDurable() throws Exception { final String jobName = "NestedParallelDurableJob"; @@ -874,6 +874,7 @@ public void evaluate() throws Throwable { } catch (AssertionError ase) { throw new AssertionError("Build hung: "+run, ase); } + verifyCompletedCleanly(story.j.jenkins, run); Assert.assertEquals(Result.SUCCESS, run.getResult()); } else { verifyCompletedCleanly(story.j.jenkins, run); @@ -889,7 +890,7 @@ public void evaluate() throws Throwable { * May fail rarely due to files being copied in a different order than they are modified as part of simulating a dirty restart. * See {@link RestartableJenkinsRule#simulateAbruptShutdown()} for why that copying happens. */ @Test - @Ignore //Too long to run as part of main suite + //Too long to run as part of main suite @TimedRepeatRule.RepeatForTime(repeatMillis = 150_000) public void fuzzTimingNonDurable() throws Exception { final String jobName = "NestedParallelDurableJob"; @@ -924,6 +925,65 @@ public void evaluate() throws Throwable { story.j.assertLogContains(logStart[0], run); } }); + story.addStep(new Statement() { + @Override + public void evaluate() throws Throwable { + Assert.assertFalse(FlowExecutionList.get().iterator().hasNext()); + WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); + Assert.assertFalse(run.isBuilding()); + Assert.assertTrue(run.getExecution().isComplete()); + if (run.getExecution() instanceof CpsFlowExecution) { + assert ((CpsFlowExecution)(run.getExecution())).done; + } + } + }); + + } + + /** Test interrupting build by randomly restarting *cleanly* at unpredictable times and verify we stick pick up and resume. */ + @Test + //Too long to run as part of main suite + @TimedRepeatRule.RepeatForTime(repeatMillis = 150_000) + public void fuzzTimingNonDurableWithCleanRestart() throws Exception { + final String jobName = "NestedParallelDurableJob"; + final String[] logStart = new String[1]; + // Create thread that eventually interrupts Jenkins with a hard shutdown at a random time interval + story.addStep(new Statement() { + @Override + public void evaluate() throws Throwable { + WorkflowRun run = runFuzzerJob(story.j, jobName, FlowDurabilityHint.PERFORMANCE_OPTIMIZED); + logStart[0] = JenkinsRule.getLog(run); + if (run.getExecution() != null) { + Assert.assertEquals(FlowDurabilityHint.PERFORMANCE_OPTIMIZED, run.getExecution().getDurabilityHint()); + } + } + }); + story.addStep(new Statement() { + @Override + public void evaluate() throws Throwable { + WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); + if (run.getExecution() != null) { + Assert.assertEquals(FlowDurabilityHint.PERFORMANCE_OPTIMIZED, run.getExecution().getDurabilityHint()); + } + if (run.isBuilding()) { + try { + story.j.waitUntilNoActivityUpTo(30_000); + } catch (AssertionError ase) { + throw new AssertionError("Build hung: "+run, ase); + } + } + verifyCompletedCleanly(story.j.jenkins, run); + story.j.assertLogContains(logStart[0], run); + if (run.isBuilding()) { + try { + story.j.waitUntilNoActivityUpTo(30_000); + } catch (AssertionError ase) { + throw new AssertionError("Build hung: "+run, ase); + } + } + Assert.assertEquals(Result.SUCCESS, run.getResult()); + } + }); } } From 7c4faa938fb1f1e2a573559dc9bffffe69f75a6b Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 17 Apr 2018 20:37:18 -0400 Subject: [PATCH 03/34] Code comments to explain --- .../org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java | 2 +- .../java/org/jenkinsci/plugins/workflow/cps/FlowHead.java | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index f5c41113e..70d468d6a 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -1809,7 +1809,7 @@ void saveOwner() { saveable.save(); } } catch (IOException ex) { - LOGGER.log(Level.WARNING, "Error persisting Run before shutdown", ex); + LOGGER.log(Level.WARNING, "Error persisting Run "+owner, ex); persistedClean = false; } } diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java index 1726cbcf5..0e168a08a 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java @@ -129,7 +129,9 @@ void setNewHead(@Nonnull FlowNode v) { execution.storage.storeNode(v, true); assert execution.storage.getNode(v.getId()) != null; v.addAction(new TimingAction()); - CpsFlowExecution.maybeAutoPersistNode(v); // Persist node before changing head, otherwise Program can have unpersisted nodes. + CpsFlowExecution.maybeAutoPersistNode(v); // Persist node before changing head, otherwise Program can have unpersisted nodes and will fail to deserialize + // NOTE: we may also need to persist the FlowExecution by persisting its owner (which will be a WorkflowRun) + // But this will be handled by the WorkflowRun GraphListener momentarily } catch (Exception e) { LOGGER.log(Level.FINE, "Failed to record new head or persist old: " + v, e); } From cc61b6f8ea66ae2725b6b6bf2de8e503e34fd3d3 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Thu, 19 Apr 2018 20:35:31 -0400 Subject: [PATCH 04/34] Save notes on persistence --- doc/persistence.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 doc/persistence.md diff --git a/doc/persistence.md b/doc/persistence.md new file mode 100644 index 000000000..1403dc537 --- /dev/null +++ b/doc/persistence.md @@ -0,0 +1,39 @@ +# The Pipeline Persistence Model + +# Data Model +Running pipelines persist in 3 pieces: + +1. The FlowNodes - stored by a FlowNodeStorage - this holds the FlowNodes created to map to Steps, and for block scoped Steps, start and end of blocks +2. The CpsFlowExecution - this is currently stored in the WorkflowRun, and the primary pieces of interest are: + * heads - the current "tips" of the Flow Graph, i.e. the FlowNodes that represent running steps and are appended to + - A head maps to a CpsThread in the Pipeline program, within the CPSThreadGroup + * starts - the BlockStartNodes marking the start(s) of the currently executing blocks + * scripts - the loaded Pipeline script files (text) + * persistedClean + - If true, Pipeline saved its execution cleanly to disk and we *might* be able to resume it + - If false, something went wrong saving the execution, so we cannot resume even if we'd otherwise be able to + - If null, probably the build dates back to before this field was added - we check to see if this is running in a highly persistent DurabilityMode (Max_survivability generally) + * done - if true, this execution completed, if false or un-set, the pipeline is a candidate to resume unless its only head is a FlowEndNode + - The handling of false is for legacy reasons, since it was only recently made persistent. + * + * various other boolean flags & settings for the execution (durability setting, user that started the build, is it sandboxed, etc) +3. The Program -- this is the current execution state of the Pipeline + * This holds the Groovy state (transformed by CPS) plus the CPSThreadGroup for the running branches of the Pipeline + * The program depends on the FlowNodes from the FlowNodeStorage, since it reads them by ID rather than storing them in the program + * This also depends on the heads in the CpsFlowExecution, because its FlowHeads are loaded from the heads of the CpsFlowExecution + * Also holds the CpsStepContext, i.e. the variables such as EnvVars, Executor and Workspace uses (the latter stored as Pickles) + - The pickles will be specially restored when the Pipeline resumes since they don't serialize/deserialize normally + +## Persistence Issues And Logic + +Some basic rules: + +1. If the FlowNodeStorage is corrupt, incomplete, or un-persisted, all manner of heck will break loose + - In terms of Pipeline execution, the impact is like the Resonance Cascade from the Half-Life games + - The pipeline can never be resumed (the key piece is missing) + - Usually we fake up some placeholder FlowNodes to cover this situation and save them +2. Whenever persisting data, the Pipeline *must* have the FlowNodes persisted on disk (via `storage.flush()` generally) +in order to be able to load the heads and restore the program. +3. Once we've set persistedClean as false and saved the FlowExecution, then it doesn't matter what we do -- the Pipeline will assume + it already has incomplete persistence data (as with 1) when trying to resume. This is how we handle the low-durability modes, to + avoid resuming a stale state of the Pipeline simply because we have old data persisted and are not updating it. \ No newline at end of file From 40dd2c44eba7f77c5aa6ef8a480d4485e54d185e Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Thu, 19 Apr 2018 20:47:23 -0400 Subject: [PATCH 05/34] Force persistence of the Pipeline build when the heads are mutated, plus remove redundant writes --- .../workflow/cps/CpsBodyExecution.java | 2 -- .../workflow/cps/CpsFlowExecution.java | 20 +++++++++++++++++++ .../jenkinsci/plugins/workflow/cps/DSL.java | 3 +++ .../plugins/workflow/cps/FlowHead.java | 8 ++++++-- .../workflow/cps/FlowDurabilityTest.java | 6 +++--- 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsBodyExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsBodyExecution.java index 0fabfcad3..bd564ca30 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsBodyExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsBodyExecution.java @@ -120,7 +120,6 @@ class CpsBodyExecution extends BodyExecution { } head.setNewHead(sn); - CpsFlowExecution.maybeAutoPersistNode(sn); StepContext sc = new CpsBodySubContext(context, sn); for (BodyExecutionCallback c : callbacks) { @@ -366,7 +365,6 @@ public Next receive(Object o) { for (BodyExecutionCallback c : callbacks) { c.onSuccess(sc, o); } - return Next.terminate(null); } diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 70d468d6a..c41531594 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -1054,10 +1054,12 @@ public synchronized boolean isCurrentHead(FlowNode n) { // synchronized void addHead(FlowHead h) { heads.put(h.getId(), h); + saveExecutionIfDurable(); // We need to save the mutated heads for the run } synchronized void removeHead(FlowHead h) { heads.remove(h.getId()); + saveExecutionIfDurable(); // We need to save the mutated heads for the run } /** @@ -1073,6 +1075,7 @@ void subsumeHead(FlowNode n) { for (FlowHead h : _heads) { if (h.get()==n) { h.remove(); + saveExecutionIfDurable(); // We need to save the mutated heads for the run return; } } @@ -1801,11 +1804,28 @@ public void autopersist(@Nonnull FlowNode n) throws IOException { } + /** Persist the execution if we are set up to save the execution with every step. */ + void saveExecutionIfDurable() { + if (this.getDurabilityHint().isPersistWithEveryStep()) { + saveOwner(); + } + } + /** Save the owner that holds this execution. */ void saveOwner() { try { if (this.owner != null && this.owner.getExecutable() instanceof Saveable) { // Null-check covers some anomalous cases we've seen Saveable saveable = (Saveable)(this.owner.getExecutable()); + persistedClean = true; + if (storage != null && storage.delegate != null) { + // Defensively flush FlowNodes to storage + try { + storage.flush(); + } catch (Exception ex) { + LOGGER.log(Level.WARNING, "Error persisting FlowNodes for execution "+owner, ex); + persistedClean = false; + } + } saveable.save(); } } catch (IOException ex) { diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/DSL.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/DSL.java index d94df50b7..9bfd5c506 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/DSL.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/DSL.java @@ -577,6 +577,9 @@ private void invokeBody(CpsThread cur) { // the first one can reuse the current thread, but other ones need to create new heads // we want to do this first before starting body so that the order of heads preserve // natural ordering. + + // FIXME give this javadocs worth a darn, because this is how we create parallel branches and the docs are crypic. + // Also we need to double-check this logic because this might cause a failure of persistence FlowHead[] heads = new FlowHead[context.bodyInvokers.size()]; for (int i = 0; i < heads.length; i++) { heads[i] = i==0 ? cur.head : cur.head.fork(); diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java index 0e168a08a..c38a10504 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java @@ -45,6 +45,7 @@ import org.jenkinsci.plugins.workflow.graph.FlowNode; import org.jenkinsci.plugins.workflow.graph.FlowStartNode; +import javax.annotation.CheckForNull; import javax.annotation.Nonnull; /** @@ -116,6 +117,7 @@ void newStartNode(FlowStartNode n) throws IOException { execution.storage.storeNode(head, false); } + /** Could be better described as "append to Flow graph" except for parallel cases. */ void setNewHead(@Nonnull FlowNode v) { if (v == null) { // Because Findbugs isn't 100% at catching cases where this can happen and we really need to fail hard-and-fast @@ -133,7 +135,7 @@ void setNewHead(@Nonnull FlowNode v) { // NOTE: we may also need to persist the FlowExecution by persisting its owner (which will be a WorkflowRun) // But this will be handled by the WorkflowRun GraphListener momentarily } catch (Exception e) { - LOGGER.log(Level.FINE, "Failed to record new head or persist old: " + v, e); + LOGGER.log(Level.WARNING, "Failed to record new head or persist old: " + v, e); } this.head = v; CpsThreadGroup c = CpsThreadGroup.current(); @@ -182,16 +184,18 @@ private void readObject(ObjectInputStream ois) throws IOException, ClassNotFound // we'll replace this with one of execution.heads() } + @Nonnull private Object readResolve() { execution = CpsFlowExecution.PROGRAM_STATE_SERIALIZATION.get(); if (execution!=null) { // See if parallel loading? FlowHead myHead = execution.getFlowHead(id); if (myHead == null) { - LOGGER.log(Level.WARNING, "FlowHead loading problem at deserialize: Null FlowHead with id "+id+" in execution "+execution); + throw new IllegalStateException("FlowHead loading problem at deserialize: Null FlowHead with id "+id+" in execution "+execution); } return myHead; } else { + LOGGER.log(Level.WARNING, "Tried to load a FlowHead from program with no Execution in PROGRAM_STATE_SERIALIZTION"); return this; } } diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java index 53af4e3c8..9d617aad8 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java @@ -839,7 +839,7 @@ private WorkflowRun runFuzzerJob(JenkinsRule jrule, String jobName, FlowDurabili * May fail rarely due to files being copied in a different order than they are modified as part of simulating a dirty restart. * See {@link RestartableJenkinsRule#simulateAbruptShutdown()} for why that copying happens. */ @Test - //Too long to run as part of main suite + @Ignore //Too long to run as part of main suite @TimedRepeatRule.RepeatForTime(repeatMillis = 150_000) public void fuzzTimingDurable() throws Exception { final String jobName = "NestedParallelDurableJob"; @@ -890,7 +890,7 @@ public void evaluate() throws Throwable { * May fail rarely due to files being copied in a different order than they are modified as part of simulating a dirty restart. * See {@link RestartableJenkinsRule#simulateAbruptShutdown()} for why that copying happens. */ @Test - //Too long to run as part of main suite + @Ignore //Too long to run as part of main suite @TimedRepeatRule.RepeatForTime(repeatMillis = 150_000) public void fuzzTimingNonDurable() throws Exception { final String jobName = "NestedParallelDurableJob"; @@ -942,7 +942,7 @@ public void evaluate() throws Throwable { /** Test interrupting build by randomly restarting *cleanly* at unpredictable times and verify we stick pick up and resume. */ @Test - //Too long to run as part of main suite + @Ignore //Too long to run as part of main suite @TimedRepeatRule.RepeatForTime(repeatMillis = 150_000) public void fuzzTimingNonDurableWithCleanRestart() throws Exception { final String jobName = "NestedParallelDurableJob"; From e173bcad3bbe9c77db26922aa2ce51424fe5ea8f Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Thu, 19 Apr 2018 21:56:02 -0400 Subject: [PATCH 06/34] Refactor and harden the fuzz tests --- .../workflow/cps/FlowDurabilityTest.java | 53 +++++++++++-------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java index 9d617aad8..fdfa563ab 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java @@ -47,6 +47,7 @@ import org.jvnet.hudson.test.JenkinsRule; import org.jvnet.hudson.test.RestartableJenkinsRule; +import javax.annotation.Nonnull; import java.io.File; import java.io.FileOutputStream; import java.lang.annotation.ElementType; @@ -84,7 +85,7 @@ public class FlowDurabilityTest { @Rule public TimedRepeatRule repeater = new TimedRepeatRule(); - // Used in testing + // Used in Race-condition/persistence fuzzing where we need to run repeatedly static class TimedRepeatRule implements TestRule { @Target({ElementType.METHOD}) @@ -790,7 +791,17 @@ public void evaluate() throws Throwable { }); } - /** Launches the job used for fuzzing in {@link #fuzzTimingDurable()} and {@link #fuzzTimingNonDurable()} -- including the timeout. */ + private static void assertBuildNotHung(@Nonnull RestartableJenkinsRule story, @Nonnull WorkflowRun run, int timeOutMillis) throws Exception { + if (run.isBuilding()) { + try { + story.j.waitUntilNoActivityUpTo(timeOutMillis); + } catch (AssertionError ase) { // Allows attaching a debugger here + throw new AssertionError("Build hung: " + run, ase); + } + } + } + + /** Launches the job used for fuzzing in the various timed fuzzing tests to catch timing-sensitive issues -- including the timeout. */ private WorkflowRun runFuzzerJob(JenkinsRule jrule, String jobName, FlowDurabilityHint hint) throws Exception { Jenkins jenkins = jrule.jenkins; WorkflowJob job = jenkins.getItemByFullName(jobName, WorkflowJob.class); @@ -869,16 +880,10 @@ public void evaluate() throws Throwable { Assert.assertEquals(FlowDurabilityHint.MAX_SURVIVABILITY, run.getExecution().getDurabilityHint()); } if (run.isBuilding()) { - try { - story.j.waitUntilNoActivityUpTo(30_000); - } catch (AssertionError ase) { - throw new AssertionError("Build hung: "+run, ase); - } - verifyCompletedCleanly(story.j.jenkins, run); + assertBuildNotHung(story, run, 30_000); Assert.assertEquals(Result.SUCCESS, run.getResult()); - } else { - verifyCompletedCleanly(story.j.jenkins, run); } + verifyCompletedCleanly(story.j.jenkins, run); assertIncludesNodes(nodesOut, run); story.j.assertLogContains(logStart[0], run); } @@ -892,7 +897,7 @@ public void evaluate() throws Throwable { @Test @Ignore //Too long to run as part of main suite @TimedRepeatRule.RepeatForTime(repeatMillis = 150_000) - public void fuzzTimingNonDurable() throws Exception { + public void fuzzTimingNonDurableWithDirtyRestart() throws Exception { final String jobName = "NestedParallelDurableJob"; final String[] logStart = new String[1]; @@ -905,6 +910,11 @@ public void evaluate() throws Throwable { if (run.getExecution() != null) { Assert.assertEquals(FlowDurabilityHint.PERFORMANCE_OPTIMIZED, run.getExecution().getDurabilityHint()); } + if (run.isBuilding()) { + Assert.assertNotEquals(Boolean.TRUE, ((CpsFlowExecution)run.getExecution()).persistedClean); + } else { + Assert.assertEquals(Boolean.TRUE, ((CpsFlowExecution)run.getExecution()).persistedClean); + } } }); story.addStep(new Statement() { @@ -914,13 +924,7 @@ public void evaluate() throws Throwable { if (run.getExecution() != null) { Assert.assertEquals(FlowDurabilityHint.PERFORMANCE_OPTIMIZED, run.getExecution().getDurabilityHint()); } - if (run.isBuilding()) { - try { - story.j.waitUntilNoActivityUpTo(30_000); - } catch (AssertionError ase) { - throw new AssertionError("Build hung: "+run, ase); - } - } + assertBuildNotHung(story, run, 30_000); verifyCompletedCleanly(story.j.jenkins, run); story.j.assertLogContains(logStart[0], run); } @@ -928,6 +932,7 @@ public void evaluate() throws Throwable { story.addStep(new Statement() { @Override public void evaluate() throws Throwable { + // Verify build doesn't resume at next restart, see JENKINS-50199 Assert.assertFalse(FlowExecutionList.get().iterator().hasNext()); WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); Assert.assertFalse(run.isBuilding()); @@ -945,8 +950,10 @@ public void evaluate() throws Throwable { @Ignore //Too long to run as part of main suite @TimedRepeatRule.RepeatForTime(repeatMillis = 150_000) public void fuzzTimingNonDurableWithCleanRestart() throws Exception { + final String jobName = "NestedParallelDurableJob"; final String[] logStart = new String[1]; + final List nodesOut = new ArrayList(); // Create thread that eventually interrupts Jenkins with a hard shutdown at a random time interval story.addStep(new Statement() { @@ -957,6 +964,9 @@ public void evaluate() throws Throwable { if (run.getExecution() != null) { Assert.assertEquals(FlowDurabilityHint.PERFORMANCE_OPTIMIZED, run.getExecution().getDurabilityHint()); } + nodesOut.clear(); + nodesOut.addAll(new DepthFirstScanner().allNodes(run.getExecution())); + nodesOut.sort(FlowScanningUtils.ID_ORDER_COMPARATOR); } }); story.addStep(new Statement() { @@ -967,11 +977,7 @@ public void evaluate() throws Throwable { Assert.assertEquals(FlowDurabilityHint.PERFORMANCE_OPTIMIZED, run.getExecution().getDurabilityHint()); } if (run.isBuilding()) { - try { - story.j.waitUntilNoActivityUpTo(30_000); - } catch (AssertionError ase) { - throw new AssertionError("Build hung: "+run, ase); - } + assertBuildNotHung(story, run, 30_000); } verifyCompletedCleanly(story.j.jenkins, run); story.j.assertLogContains(logStart[0], run); @@ -983,6 +989,7 @@ public void evaluate() throws Throwable { } } Assert.assertEquals(Result.SUCCESS, run.getResult()); + assertIncludesNodes(nodesOut, run); } }); } From dde404b5a1bc790724dcb9d4f24bb92978d3d6d5 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Fri, 20 Apr 2018 17:34:16 -0400 Subject: [PATCH 07/34] Ensure we always initialize heads for FlowNodes and handle additional errors --- .../plugins/workflow/cps/CpsFlowExecution.java | 18 +++++++++++++----- .../cps/actions/PersistanceController.java | 7 +++++++ 2 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 src/main/java/org/jenkinsci/plugins/workflow/cps/actions/PersistanceController.java diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index a31175ad5..4c4a24fa0 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -617,12 +617,14 @@ private synchronized String getHeadsAsString() { private void rebuildEmptyGraph() { synchronized (this) { // something went catastrophically wrong and there's no live head. fake one - LOGGER.log(Level.WARNING, "Failed to load pipeline heads, so faking some up for execution " + this.toString()); + LOGGER.log(Level.WARNING, "Failed to load pipeline heads/start nodes, so faking some up for execution " + this.toString()); if (this.startNodes == null) { this.startNodes = new Stack(); } - if (this.heads != null && this.heads.size() > 0) { + if (this.heads == null) { + this.heads = new TreeMap(); + } else if (!this.heads.isEmpty()) { if (LOGGER.isLoggable(Level.INFO)) { LOGGER.log(Level.INFO, "Resetting heads to rebuild the Pipeline structure, tossing existing heads: "+getHeadsAsString()); } @@ -686,8 +688,8 @@ protected synchronized void initializeStorage() throws IOException { } startNodesSerial = null; - } catch (IOException ioe) { - LOGGER.log(Level.WARNING, "Error initializing storage and loading nodes", ioe); + } catch (Exception ioe) { + LOGGER.log(Level.WARNING, "Error initializing storage and loading nodes for "+this, ioe); storageErrors = true; } @@ -1519,7 +1521,13 @@ public void pause(final boolean v) throws IOException { exec = (CpsFlowExecution) execution; // Like waitForSuspension but with a timeout: if (exec.programPromise != null) { - exec.programPromise.get(1, TimeUnit.MINUTES).scheduleRun().get(1, TimeUnit.MINUTES); + LOGGER.log(Level.FINER, "Waiting for Pipeline to go to sleep for shutdown: "+execution); + try { + exec.programPromise.get(1, TimeUnit.MINUTES).scheduleRun().get(1, TimeUnit.MINUTES); + LOGGER.log(Level.FINER, " Pipeline went to sleep OK: "+execution); + } catch (InterruptedException | TimeoutException ex) { + LOGGER.log(Level.WARNING, "Error waiting for Pipeline to suspend: "+exec, ex); + } } cpsExec.checkpoint(); } diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/actions/PersistanceController.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/actions/PersistanceController.java new file mode 100644 index 000000000..5cc39d0b3 --- /dev/null +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/actions/PersistanceController.java @@ -0,0 +1,7 @@ +package org.jenkinsci.plugins.workflow.cps.actions; + +/** + * @author Sam Van Oort + */ +public class PersistanceController { +} From 7b9ebe043e54e644bb668f09bdddd2bf005a8e1c Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Fri, 20 Apr 2018 18:15:28 -0400 Subject: [PATCH 08/34] Add logging for some odd circumstances with deserializing and intializing the execution --- .../plugins/workflow/cps/CpsFlowExecution.java | 11 +++++++++++ .../workflow/cps/actions/PersistanceController.java | 7 ------- 2 files changed, 11 insertions(+), 7 deletions(-) delete mode 100644 src/main/java/org/jenkinsci/plugins/workflow/cps/actions/PersistanceController.java diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 4c4a24fa0..94c827bdb 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -719,6 +719,17 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { this.owner = owner; try { initializeStorage(); + if (heads == null) { + LOGGER.log(Level.WARNING, "Null Flow heads after initializing storage for FlowExecution "+this); + } else if (heads.isEmpty()) { + LOGGER.log(Level.INFO, "Empty flow heads after initializing storage - not necessarily an error, but odd - for FlowExecution "+this); + } + if (startNodes == null) { + LOGGER.log(Level.WARNING, "Null block start nodes after initializing storage for FlowExecution "+this); + } else if (startNodes.isEmpty()) { + LOGGER.log(Level.INFO, "Empty block start nodes after initializing storage - not necessarily an error, but odd - for FlowExecution "+this); + } + try { if (!isComplete()) { if (canResume()) { diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/actions/PersistanceController.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/actions/PersistanceController.java deleted file mode 100644 index 5cc39d0b3..000000000 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/actions/PersistanceController.java +++ /dev/null @@ -1,7 +0,0 @@ -package org.jenkinsci.plugins.workflow.cps.actions; - -/** - * @author Sam Van Oort - */ -public class PersistanceController { -} From e0f0025995a59127ed495d708da8873a2ae22cff Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Fri, 20 Apr 2018 18:24:53 -0400 Subject: [PATCH 09/34] Fix one cause of builds showing up as incomplete when can't be loaded --- .../org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 94c827bdb..10a95430d 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -616,6 +616,7 @@ private synchronized String getHeadsAsString() { /** Handle failures where we can't load heads. */ private void rebuildEmptyGraph() { synchronized (this) { + this.done = Boolean.TRUE; // Ensures that the flow does not show as incomplete if the graph data is corrupt // something went catastrophically wrong and there's no live head. fake one LOGGER.log(Level.WARNING, "Failed to load pipeline heads/start nodes, so faking some up for execution " + this.toString()); if (this.startNodes == null) { @@ -689,6 +690,7 @@ protected synchronized void initializeStorage() throws IOException { startNodesSerial = null; } catch (Exception ioe) { + this.done = Boolean.TRUE; LOGGER.log(Level.WARNING, "Error initializing storage and loading nodes for "+this, ioe); storageErrors = true; } @@ -732,6 +734,7 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { try { if (!isComplete()) { + // FOR SOME REASON we're arriving here even for *completed* builds sometimes, hopefully logging above helps if (canResume()) { loadProgramAsync(getProgramDataFile()); } else { From 646b1d2dcbaf17519fbca0509f745a0537facee2 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 24 Apr 2018 18:29:42 -0400 Subject: [PATCH 10/34] Add a bunch of persistence test cases that will fail because our persistence is not correct --- .../plugins/workflow/cps/FlowHead.java | 4 +- .../workflow/cps/PersistenceProblemsTest.java | 274 ++++++++++++++++++ 2 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java index c38a10504..103f9f7f5 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java @@ -25,6 +25,7 @@ package org.jenkinsci.plugins.workflow.cps; import com.cloudbees.groovy.cps.Outcome; +import com.google.common.annotations.VisibleForTesting; import hudson.model.Action; import java.io.IOException; import java.io.ObjectInputStream; @@ -69,7 +70,8 @@ final class FlowHead implements Serializable { private /*almost final except for serialization*/ int id; private /*almost final except for serialization*/ transient CpsFlowExecution execution; - private FlowNode head; // TODO: rename to node + @VisibleForTesting + FlowNode head; // TODO: rename to node FlowHead(CpsFlowExecution execution, int id) { this.id = id; diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java new file mode 100644 index 000000000..c54f0a0cb --- /dev/null +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java @@ -0,0 +1,274 @@ +package org.jenkinsci.plugins.workflow.cps; + +import com.google.common.util.concurrent.ListenableFuture; +import hudson.model.Queue; +import hudson.model.Result; +import org.jenkinsci.plugins.workflow.flow.FlowDurabilityHint; +import org.jenkinsci.plugins.workflow.flow.FlowExecution; +import org.jenkinsci.plugins.workflow.flow.FlowExecutionList; +import org.jenkinsci.plugins.workflow.graph.FlowEndNode; +import org.jenkinsci.plugins.workflow.graph.FlowStartNode; +import org.jenkinsci.plugins.workflow.job.WorkflowJob; +import org.jenkinsci.plugins.workflow.job.WorkflowRun; +import org.jenkinsci.plugins.workflow.job.properties.DurabilityHintJobProperty; +import org.jenkinsci.plugins.workflow.support.steps.input.InputAction; +import org.jenkinsci.plugins.workflow.support.steps.input.InputStepExecution; +import org.junit.Assert; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.jvnet.hudson.test.BuildWatcher; +import org.jvnet.hudson.test.JenkinsRule; +import org.jvnet.hudson.test.RestartableJenkinsRule; + +import java.io.File; +import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * Verifies we can cope with all the bizarre quirks that occur when persistence fails or something unexpected happens. + */ +public class PersistenceProblemsTest { + @ClassRule + public static BuildWatcher buildWatcher = new BuildWatcher(); + + @Rule + public RestartableJenkinsRule story = new RestartableJenkinsRule(); + + /** Verifies all the assumptions about a cleanly finished build. */ + static void assertCompletedCleanly(WorkflowRun run) throws Exception { + if (run.isBuilding()) { + System.out.println("Run initially building, going to wait a second to see if it finishes, run="+run); + Thread.sleep(1); + } + Assert.assertFalse(run.isBuilding()); + Assert.assertNotNull(run.getResult()); + FlowExecution fe = run.getExecution(); + FlowExecutionList.get().forEach(f -> { + if (fe != null && f == fe) { + Assert.fail("FlowExecution still in FlowExecutionList!"); + } + }); + Assert.assertTrue("Queue not empty after completion!", Queue.getInstance().isEmpty()); + + if (fe instanceof CpsFlowExecution) { + CpsFlowExecution cpsExec = (CpsFlowExecution)fe; + Assert.assertTrue(cpsExec.isComplete()); + Assert.assertEquals(Boolean.TRUE, cpsExec.persistedClean); + Assert.assertEquals(Boolean.TRUE, cpsExec.done); + Assert.assertEquals(1, cpsExec.getCurrentHeads().size()); + Assert.assertTrue(cpsExec.getCurrentHeads().get(0) instanceof FlowEndNode); + Assert.assertTrue(cpsExec.startNodes == null || cpsExec.startNodes.isEmpty()); + Assert.assertFalse(cpsExec.blocksRestart()); + } else { + System.out.println("WARNING: no FlowExecutionForBuild"); + } + } + + static void assertCleanInProgress(WorkflowRun run) throws Exception { + Assert.assertTrue(run.isBuilding()); + Assert.assertNull(run.getResult()); + FlowExecution fe = run.getExecution(); + AtomicBoolean hasExecutionInList = new AtomicBoolean(false); + FlowExecutionList.get().forEach(f -> { + if (fe != null && f == fe) { + hasExecutionInList.set(true); + } + }); + if (!hasExecutionInList.get()) { + Assert.fail("Build completed but should still show in FlowExecutionList"); + } + CpsFlowExecution cpsExec = (CpsFlowExecution)fe; + Assert.assertFalse(cpsExec.isComplete()); + Assert.assertEquals(Boolean.FALSE, cpsExec.done); + Assert.assertFalse(cpsExec.getCurrentHeads().get(0) instanceof FlowEndNode); + Assert.assertTrue(cpsExec.startNodes != null && !cpsExec.startNodes.isEmpty()); + } + + /** Create and run a basic build before we mangle its persisted contents. Stores job number to jobIdNumber index 0. */ + private static WorkflowRun runBasicBuild(JenkinsRule j, String jobName, int[] jobIdNumber) throws Exception { + WorkflowJob job = j.jenkins.createProject(WorkflowJob.class, jobName); + job.setDefinition(new CpsFlowDefinition("echo 'doSomething'", true)); + job.addProperty(new DurabilityHintJobProperty(FlowDurabilityHint.MAX_SURVIVABILITY)); + WorkflowRun run = j.buildAndAssertSuccess(job); + jobIdNumber[0] = run.getNumber(); + assertCompletedCleanly(run); + return run; + } + + /** Sets up a running build that is waiting on input. */ + private static WorkflowRun runBasicPauseOnInput(JenkinsRule j, String jobName, int[] jobIdNumber) throws Exception { + WorkflowJob job = j.jenkins.createProject(WorkflowJob.class, jobName); + job.setDefinition(new CpsFlowDefinition("input 'pause'", true)); + job.addProperty(new DurabilityHintJobProperty(FlowDurabilityHint.MAX_SURVIVABILITY)); + + WorkflowRun run = job.scheduleBuild2(0).getStartCondition().get(); + ListenableFuture listener = run.getExecutionPromise(); + FlowExecution exec = listener.get(); + while(run.getAction(InputAction.class) != null) { // Wait until input step starts + Thread.sleep(50); + } + Thread.sleep(1000L); + jobIdNumber[0] = run.getNumber(); + return run; + } + + private static InputStepExecution getInputStepExecution(WorkflowRun run, String inputMessage) throws Exception { + InputAction ia = run.getAction(InputAction.class); + List execList = ia.getExecutions(); + return execList.stream().filter(e -> inputMessage.equals(e.getInput().getMessage())).findFirst().orElse(null); + } + + final static String DEFAULT_JOBNAME = "testJob"; + + /** Simulates something happening badly during final shutdown, which may cause build to not appear done. */ + @Test + public void completedFinalFlowNodeNotPersisted() throws Exception { + final int[] build = new int[1]; + story.thenWithHardShutdown( j -> { + WorkflowRun run = runBasicBuild(j, DEFAULT_JOBNAME, build); + String finalId = run.getExecution().getCurrentHeads().get(0).getId(); + + // Hack but deletes the file from disk + CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); + File f = new File(cpsExec.getStorageDir(), finalId+".xml"); + f.delete(); + }); + story.then(j-> { + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCompletedCleanly(run); + Assert.assertEquals(Result.SUCCESS, run.getResult()); + }); + } + /** Perhaps there was a serialization error breaking the FlowGraph persistence for non-durable mode. */ + @Test + public void completedNoNodesPersisted() throws Exception { + final int[] build = new int[1]; + story.thenWithHardShutdown( j -> { + WorkflowRun run = runBasicBuild(j, DEFAULT_JOBNAME, build); + + // Hack but deletes the FlowNodeStorage from disk + CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); + cpsExec.getStorageDir().delete(); + }); + story.then(j-> { + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCompletedCleanly(run); + Assert.assertEquals(Result.SUCCESS, run.getResult()); + }); + } + + /** Unserializable flownodes or other errors */ + public void errorSavingNodes() throws Exception { + // TODO new step that generates a readResolve error when loaded + } + + /** Simulates case where done flag was not persisted. */ + @Test + public void completedButWrongDoneStatus() throws Exception { + final int[] build = new int[1]; + story.thenWithHardShutdown( j -> { + WorkflowRun run = runBasicBuild(j, DEFAULT_JOBNAME, build); + String finalId = run.getExecution().getCurrentHeads().get(0).getId(); + + // Hack but deletes the FlowNodeStorage from disk + CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); + cpsExec.done = false; + cpsExec.saveOwner(); + }); + story.then(j-> { + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCompletedCleanly(run); + Assert.assertEquals(Result.SUCCESS, run.getResult()); + }); + } + + @Test + public void inProgressNormal() throws Exception { + final int[] build = new int[1]; + story.then( j -> { + WorkflowRun run = runBasicPauseOnInput(j, DEFAULT_JOBNAME, build); + }); + story.then( j->{ + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCleanInProgress(run); + InputStepExecution exec = getInputStepExecution(run, "pause"); + exec.doProceedEmpty(); + j.waitForCompletion(run); + assertCompletedCleanly(run); + Assert.assertEquals(Result.SUCCESS, run.getResult()); + }); + } + + @Test + public void inProgressButFlowNodesLost() throws Exception { + final int[] build = new int[1]; + story.thenWithHardShutdown( j -> { + WorkflowRun run = runBasicPauseOnInput(j, DEFAULT_JOBNAME, build); + CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); + cpsExec.getStorageDir().delete(); + }); + story.then( j->{ + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCompletedCleanly(run); + }); + } + + /** Failed to save the Execution after the Program bumped the FlowHeads */ + @Test + public void inProgressButProgramAheadOfExecutionLost() throws Exception { + final int[] build = new int[1]; + story.thenWithHardShutdown( j -> { + WorkflowRun run = runBasicPauseOnInput(j, DEFAULT_JOBNAME, build); + CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); + FlowHead currHead = cpsExec.heads.firstEntry().getValue(); + + // Set the head to the previous value and SAVE + currHead.head = cpsExec.getCurrentHeads().get(0).getParents().get(0); + run.save(); + }); + story.then( j->{ + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCompletedCleanly(run); + }); + } + + @Test + /** Build okay but program fails to load */ + public void inProgressButProgramLoadFailure() throws Exception { + final int[] build = new int[1]; + story.thenWithHardShutdown( j -> { + WorkflowRun run = runBasicPauseOnInput(j, DEFAULT_JOBNAME, build); + CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); + cpsExec.getProgramDataFile().delete(); + }); + story.then( j->{ + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCompletedCleanly(run); + }); + } + + @Test + /** Build okay but then the start nodes get screwed up */ + public void inProgressButStartBlocksLost() throws Exception { + final int[] build = new int[1]; + story.thenWithHardShutdown( j -> { + WorkflowRun run = runBasicPauseOnInput(j, DEFAULT_JOBNAME, build); + CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); + cpsExec.startNodes.push(new FlowStartNode(cpsExec, cpsExec.iotaStr())); + run.save(); + }); + story.then( j->{ + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCompletedCleanly(run); + }); + } +} From 06e56f2680c81f236b060874f1270c5518e4a690 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 24 Apr 2018 18:47:10 -0400 Subject: [PATCH 11/34] Additional persistence problems testcases --- .../workflow/cps/PersistenceProblemsTest.java | 68 ++++++++++++++++--- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java index c54f0a0cb..0446be520 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java @@ -54,7 +54,7 @@ static void assertCompletedCleanly(WorkflowRun run) throws Exception { if (fe instanceof CpsFlowExecution) { CpsFlowExecution cpsExec = (CpsFlowExecution)fe; Assert.assertTrue(cpsExec.isComplete()); - Assert.assertEquals(Boolean.TRUE, cpsExec.persistedClean); +// Assert.assertEquals(Boolean.TRUE, cpsExec.persistedClean); Assert.assertEquals(Boolean.TRUE, cpsExec.done); Assert.assertEquals(1, cpsExec.getCurrentHeads().size()); Assert.assertTrue(cpsExec.getCurrentHeads().get(0) instanceof FlowEndNode); @@ -86,21 +86,26 @@ static void assertCleanInProgress(WorkflowRun run) throws Exception { } /** Create and run a basic build before we mangle its persisted contents. Stores job number to jobIdNumber index 0. */ - private static WorkflowRun runBasicBuild(JenkinsRule j, String jobName, int[] jobIdNumber) throws Exception { + private static WorkflowRun runBasicBuild(JenkinsRule j, String jobName, int[] jobIdNumber, FlowDurabilityHint hint) throws Exception { WorkflowJob job = j.jenkins.createProject(WorkflowJob.class, jobName); job.setDefinition(new CpsFlowDefinition("echo 'doSomething'", true)); - job.addProperty(new DurabilityHintJobProperty(FlowDurabilityHint.MAX_SURVIVABILITY)); + job.addProperty(new DurabilityHintJobProperty(hint)); WorkflowRun run = j.buildAndAssertSuccess(job); jobIdNumber[0] = run.getNumber(); assertCompletedCleanly(run); return run; } + /** Create and run a basic build before we mangle its persisted contents. Stores job number to jobIdNumber index 0. */ + private static WorkflowRun runBasicBuild(JenkinsRule j, String jobName, int[] jobIdNumber) throws Exception { + return runBasicBuild(j, jobName, jobIdNumber, FlowDurabilityHint.MAX_SURVIVABILITY); + } + /** Sets up a running build that is waiting on input. */ - private static WorkflowRun runBasicPauseOnInput(JenkinsRule j, String jobName, int[] jobIdNumber) throws Exception { + private static WorkflowRun runBasicPauseOnInput(JenkinsRule j, String jobName, int[] jobIdNumber, FlowDurabilityHint durabilityHint) throws Exception { WorkflowJob job = j.jenkins.createProject(WorkflowJob.class, jobName); job.setDefinition(new CpsFlowDefinition("input 'pause'", true)); - job.addProperty(new DurabilityHintJobProperty(FlowDurabilityHint.MAX_SURVIVABILITY)); + job.addProperty(new DurabilityHintJobProperty(durabilityHint)); WorkflowRun run = job.scheduleBuild2(0).getStartCondition().get(); ListenableFuture listener = run.getExecutionPromise(); @@ -113,6 +118,10 @@ private static WorkflowRun runBasicPauseOnInput(JenkinsRule j, String jobName, i return run; } + private static WorkflowRun runBasicPauseOnInput(JenkinsRule j, String jobName, int[] jobIdNumber) throws Exception { + return runBasicPauseOnInput(j, jobName, jobIdNumber, FlowDurabilityHint.MAX_SURVIVABILITY); + } + private static InputStepExecution getInputStepExecution(WorkflowRun run, String inputMessage) throws Exception { InputAction ia = run.getAction(InputAction.class); List execList = ia.getExecutions(); @@ -160,11 +169,6 @@ public void completedNoNodesPersisted() throws Exception { }); } - /** Unserializable flownodes or other errors */ - public void errorSavingNodes() throws Exception { - // TODO new step that generates a readResolve error when loaded - } - /** Simulates case where done flag was not persisted. */ @Test public void completedButWrongDoneStatus() throws Exception { @@ -204,6 +208,50 @@ public void inProgressNormal() throws Exception { }); } + @Test + public void inProgressMaxPerfCleanShutdown() throws Exception { + final int[] build = new int[1]; + story.then( j -> { + WorkflowRun run = runBasicPauseOnInput(j, DEFAULT_JOBNAME, build, FlowDurabilityHint.PERFORMANCE_OPTIMIZED); + // SHOULD still save at end via persist-at-shutdown hooks + }); + story.then( j->{ + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCleanInProgress(run); + InputStepExecution exec = getInputStepExecution(run, "pause"); + exec.doProceedEmpty(); + j.waitForCompletion(run); + assertCompletedCleanly(run); + Assert.assertEquals(Result.SUCCESS, run.getResult()); + }); + } + + @Test + public void inProgressMaxPerfDirtyShutdown() throws Exception { + final int[] build = new int[1]; + final String[] finalNodeId = new String[1]; + story.thenWithHardShutdown( j -> { + runBasicPauseOnInput(j, DEFAULT_JOBNAME, build, FlowDurabilityHint.PERFORMANCE_OPTIMIZED); + // SHOULD still save at end via persist-at-shutdown hooks + }); + story.then( j->{ + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + j.waitForCompletion(run); + assertCompletedCleanly(run); + Assert.assertEquals(Result.FAILURE, run.getResult()); + finalNodeId[0] = run.getExecution().getCurrentHeads().get(0).getId(); + }); + story.then(j-> { + WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); + WorkflowRun run = r.getBuildByNumber(build[0]); + assertCompletedCleanly(run); + Assert.assertEquals(finalNodeId[0], run.getExecution().getCurrentHeads().get(0).getId()); + // JENKINS-50199, verify it doesn't try to resume again + }); + } + @Test public void inProgressButFlowNodesLost() throws Exception { final int[] build = new int[1]; From 8d1b7f58260c7ae444cb1fd11ebb40061dd77f9f Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 24 Apr 2018 18:58:25 -0400 Subject: [PATCH 12/34] Refine testcases a bit more --- .../plugins/workflow/cps/PersistenceProblemsTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java index 0446be520..2e42baad9 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java @@ -39,7 +39,7 @@ public class PersistenceProblemsTest { static void assertCompletedCleanly(WorkflowRun run) throws Exception { if (run.isBuilding()) { System.out.println("Run initially building, going to wait a second to see if it finishes, run="+run); - Thread.sleep(1); + Thread.sleep(1000); } Assert.assertFalse(run.isBuilding()); Assert.assertNotNull(run.getResult()); @@ -54,7 +54,7 @@ static void assertCompletedCleanly(WorkflowRun run) throws Exception { if (fe instanceof CpsFlowExecution) { CpsFlowExecution cpsExec = (CpsFlowExecution)fe; Assert.assertTrue(cpsExec.isComplete()); -// Assert.assertEquals(Boolean.TRUE, cpsExec.persistedClean); + Assert.assertEquals(Boolean.TRUE, cpsExec.persistedClean); Assert.assertEquals(Boolean.TRUE, cpsExec.done); Assert.assertEquals(1, cpsExec.getCurrentHeads().size()); Assert.assertTrue(cpsExec.getCurrentHeads().get(0) instanceof FlowEndNode); From 3cbc60c94e508145e548c7951d79de4c7efca10c Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 24 Apr 2018 21:17:42 -0400 Subject: [PATCH 13/34] Obligatory save when ending the execution, error out when stored flownode data is corrupted rather than continuing --- .../workflow/cps/CpsFlowExecution.java | 65 ++++++++++++------- .../workflow/cps/FlowDurabilityTest.java | 2 +- .../workflow/cps/PersistenceProblemsTest.java | 6 +- 3 files changed, 47 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 10a95430d..350744ee8 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -35,6 +35,7 @@ import com.cloudbees.jenkins.support.api.Component; import com.cloudbees.jenkins.support.api.Container; import com.cloudbees.jenkins.support.api.Content; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.util.concurrent.FutureCallback; @@ -231,6 +232,7 @@ * @author Kohsuke Kawaguchi */ @PersistIn(RUN) +@SuppressFBWarnings(value = "IS_FIELD_NOT_GUARDED", justification = "temporary only") public class CpsFlowExecution extends FlowExecution implements BlockableResume { /** * Groovy script of the main source file (that the user enters in the GUI) @@ -640,7 +642,7 @@ private void rebuildEmptyGraph() { startNodes.push(start); head.newStartNode(start); } catch (IOException e) { - LOGGER.log(Level.WARNING, "Failed to persist", e); + LOGGER.log(Level.WARNING, "Failed to persist FlowNode", e); } persistedClean = false; startNodesSerial = null; @@ -698,8 +700,8 @@ protected synchronized void initializeStorage() throws IOException { if (storageErrors) { // this.storageDir = (this.storageDir != null) ? this.storageDir+"-fallback" : "workflow-fallback"; // Avoid overwriting data this.storage = createStorage(); // Empty storage - // Need to find a way to mimic up the heads and fail cleanly, far enough to let the canResume do its thing - rebuildEmptyGraph(); + rebuildEmptyGraph(); // Mimic up some basic flowGraph data as much as we can. + throw new IOException("Failed to load FlowNodes for build, see errors in Jenkins log"); } } @@ -719,41 +721,42 @@ public boolean canResume() { @SuppressFBWarnings(value = "RC_REF_COMPARISON_BAD_PRACTICE_BOOLEAN", justification = "We want to explicitly check for boolean not-null and true") public void onLoad(FlowExecutionOwner owner) throws IOException { this.owner = owner; + try { initializeStorage(); if (heads == null) { LOGGER.log(Level.WARNING, "Null Flow heads after initializing storage for FlowExecution "+this); } else if (heads.isEmpty()) { - LOGGER.log(Level.INFO, "Empty flow heads after initializing storage - not necessarily an error, but odd - for FlowExecution "+this); + LOGGER.log(Level.INFO, "Empty flow heads after initializing storage - probably an error, but odd - for FlowExecution "+this); } if (startNodes == null) { LOGGER.log(Level.WARNING, "Null block start nodes after initializing storage for FlowExecution "+this); - } else if (startNodes.isEmpty()) { + } else if (startNodes.isEmpty() && heads != null && !(heads.isEmpty())) { LOGGER.log(Level.INFO, "Empty block start nodes after initializing storage - not necessarily an error, but odd - for FlowExecution "+this); } - try { - if (!isComplete()) { - // FOR SOME REASON we're arriving here even for *completed* builds sometimes, hopefully logging above helps - if (canResume()) { - loadProgramAsync(getProgramDataFile()); - } else { - // TODO if possible, consider trying to close out unterminated blocks to keep existing graph history - // That way we can visualize the graph in some error cases. - LOGGER.log(Level.WARNING, "Pipeline state not properly persisted, cannot resume "+owner.getUrl()); - throw new IOException("Cannot resume build -- was not cleanly saved when Jenkins shut down."); - } - } else if (done && !super.isComplete()) { - LOGGER.log(Level.WARNING, "Completed flow without FlowEndNode: "+this+" heads:"+getHeadsAsString()); + if (!isComplete()) { + // FOR SOME REASON we're arriving here even for *completed* builds sometimes, hopefully logging above helps + if (canResume()) { + loadProgramAsync(getProgramDataFile()); + } else { + // TODO if possible, consider trying to close out unterminated blocks to keep existing graph history + // That way we can visualize the graph in some error cases. + LOGGER.log(Level.WARNING, "Pipeline state not properly persisted, cannot resume "+owner.getUrl()); + throw new IOException("Cannot resume build -- was not cleanly saved when Jenkins shut down."); } - } catch (Exception e) { // Multicatch ensures that failure to load does not nuke the master - SettableFuture p = SettableFuture.create(); - programPromise = p; - loadProgramFailed(e, p); + } else if (done && !super.isComplete()) { + LOGGER.log(Level.WARNING, "Completed flow without FlowEndNode: "+this+" heads:"+getHeadsAsString()); + } + } catch (Exception e) { // Multicatch ensures that failure to load does not nuke the master + SettableFuture p = SettableFuture.create(); + programPromise = p; + loadProgramFailed(e, p); } finally { if (programPromise == null) { programPromise = Futures.immediateFailedFuture(new IllegalStateException("completed or broken execution")); + } } } @@ -1240,7 +1243,7 @@ synchronized void onProgramEnd(Outcome outcome) { } } catch (Exception ex) { done = Boolean.TRUE; - throw ex; + LOGGER.log(Level.WARNING, "Error trying to end execution "+this, ex); } try { @@ -1248,6 +1251,9 @@ synchronized void onProgramEnd(Outcome outcome) { } catch (IOException ioe) { LOGGER.log(Level.WARNING, "Error flushing FlowNodeStorage to disk at end of run", ioe); } + + this.persistedClean = Boolean.TRUE; + saveOwner(); } void cleanUpHeap() { @@ -1875,12 +1881,15 @@ private void checkpoint() { // Try to ensure we've saved the appropriate things -- the program is the last stumbling block. try { final SettableFuture myOutcome = SettableFuture.create(); + LOGGER.log(Level.INFO, "About to try to checkpoint the program for build"+this); if (programPromise != null && programPromise.isDone()) { runInCpsVmThread(new FutureCallback() { @Override public void onSuccess(CpsThreadGroup result) { try { + LOGGER.log(Level.INFO, "Trying to save program before shutdown "+this); result.saveProgramIfPossible(true); + LOGGER.log(Level.INFO, "Finished saving program before shutdown "+this); myOutcome.set(null); } catch (Exception ex) { LOGGER.log(Level.WARNING, "Error persisting program: "+ex); @@ -1890,10 +1899,12 @@ public void onSuccess(CpsThreadGroup result) { @Override public void onFailure(Throwable t) { + LOGGER.log(Level.WARNING, "Failed trying to save program before shutdown "+this); myOutcome.setException(t); } }); myOutcome.get(30, TimeUnit.SECONDS); + LOGGER.log(Level.FINE, "Successfully saved program for "+this); } } catch (TimeoutException te) { @@ -1905,12 +1916,18 @@ public void onFailure(Throwable t) { } try { // Flush node storage just in case the Program mutated it, just to be sure storage.flush(); + LOGGER.log(Level.FINE, "Successfully did final flush of storage for "+this); } catch (IOException ioe) { persistOk=false; LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before shutdown", ioe); } persistedClean = persistOk; - saveOwner(); + try { + saveOwner(); + } catch (Exception ex) { + LOGGER.log(Level.WARNING, "Error saving build for "+this, ex); + } + } } diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java index fdfa563ab..bb6c60d82 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java @@ -605,7 +605,7 @@ public void evaluate() throws Throwable { /** Verify that if we bomb out because we cannot resume, we at least try to finish the flow graph if we have something to work with. */ @Test - @Ignore // Can be fleshed out later if we have a valid need for it. + // Can be fleshed out later if we have a valid need for it. public void testPipelineFinishesFlowGraph() throws Exception { final String[] logStart = new String[1]; final List nodesOut = new ArrayList(); diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java index 2e42baad9..32602aea6 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java @@ -15,6 +15,7 @@ import org.jenkinsci.plugins.workflow.support.steps.input.InputStepExecution; import org.junit.Assert; import org.junit.ClassRule; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.jvnet.hudson.test.BuildWatcher; @@ -54,9 +55,9 @@ static void assertCompletedCleanly(WorkflowRun run) throws Exception { if (fe instanceof CpsFlowExecution) { CpsFlowExecution cpsExec = (CpsFlowExecution)fe; Assert.assertTrue(cpsExec.isComplete()); - Assert.assertEquals(Boolean.TRUE, cpsExec.persistedClean); Assert.assertEquals(Boolean.TRUE, cpsExec.done); Assert.assertEquals(1, cpsExec.getCurrentHeads().size()); + Assert.assertTrue(cpsExec.isComplete()); Assert.assertTrue(cpsExec.getCurrentHeads().get(0) instanceof FlowEndNode); Assert.assertTrue(cpsExec.startNodes == null || cpsExec.startNodes.isEmpty()); Assert.assertFalse(cpsExec.blocksRestart()); @@ -209,6 +210,7 @@ public void inProgressNormal() throws Exception { } @Test + @Ignore public void inProgressMaxPerfCleanShutdown() throws Exception { final int[] build = new int[1]; story.then( j -> { @@ -228,6 +230,7 @@ public void inProgressMaxPerfCleanShutdown() throws Exception { } @Test + @Ignore public void inProgressMaxPerfDirtyShutdown() throws Exception { final int[] build = new int[1]; final String[] finalNodeId = new String[1]; @@ -238,6 +241,7 @@ public void inProgressMaxPerfDirtyShutdown() throws Exception { story.then( j->{ WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); WorkflowRun run = r.getBuildByNumber(build[0]); + Thread.sleep(1000); j.waitForCompletion(run); assertCompletedCleanly(run); Assert.assertEquals(Result.FAILURE, run.getResult()); From 3fa7b8e2bacb28d5453cb26ac8fa91b6e2c6426d Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Wed, 25 Apr 2018 08:36:48 -0400 Subject: [PATCH 14/34] Simply error out loading an execution with unpersisted flownodes and let it be nulled + fix tests that delete all flownodes --- pom.xml | 3 +- .../workflow/cps/CpsFlowExecution.java | 62 +++---------------- .../workflow/cps/PersistenceProblemsTest.java | 26 +++++--- 3 files changed, 29 insertions(+), 62 deletions(-) diff --git a/pom.xml b/pom.xml index baa9f2e76..aa129d3af 100644 --- a/pom.xml +++ b/pom.xml @@ -141,7 +141,8 @@ org.jenkins-ci.plugins.workflow workflow-job - 2.20 + + 2.21-20180425.020403-1 test diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 350744ee8..bc7c3299d 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -610,43 +610,7 @@ private synchronized String getHeadsAsString() { } else if (myHeads.size() == 0) { return "empty-heads"; } else { - return myHeads.entrySet().stream().map(h->h.getKey()+"::"+h.getValue()).collect(Collectors.joining(",")); - } - - } - - /** Handle failures where we can't load heads. */ - private void rebuildEmptyGraph() { - synchronized (this) { - this.done = Boolean.TRUE; // Ensures that the flow does not show as incomplete if the graph data is corrupt - // something went catastrophically wrong and there's no live head. fake one - LOGGER.log(Level.WARNING, "Failed to load pipeline heads/start nodes, so faking some up for execution " + this.toString()); - if (this.startNodes == null) { - this.startNodes = new Stack(); - } - - if (this.heads == null) { - this.heads = new TreeMap(); - } else if (!this.heads.isEmpty()) { - if (LOGGER.isLoggable(Level.INFO)) { - LOGGER.log(Level.INFO, "Resetting heads to rebuild the Pipeline structure, tossing existing heads: "+getHeadsAsString()); - } - this.heads.clear(); - } - - this.startNodes.clear(); - FlowHead head = new FlowHead(this); - heads.put(head.getId(), head); - try { - FlowStartNode start = new FlowStartNode(this, iotaStr()); - startNodes.push(start); - head.newStartNode(start); - } catch (IOException e) { - LOGGER.log(Level.WARNING, "Failed to persist FlowNode", e); - } - persistedClean = false; - startNodesSerial = null; - headsSerial = null; + return myHeads.entrySet().stream().map(h -> h.getKey() + "::" + h.getValue()).collect(Collectors.joining(",")); } } @@ -697,10 +661,7 @@ protected synchronized void initializeStorage() throws IOException { storageErrors = true; } - if (storageErrors) { // - this.storageDir = (this.storageDir != null) ? this.storageDir+"-fallback" : "workflow-fallback"; // Avoid overwriting data - this.storage = createStorage(); // Empty storage - rebuildEmptyGraph(); // Mimic up some basic flowGraph data as much as we can. + if (storageErrors) { throw new IOException("Failed to load FlowNodes for build, see errors in Jenkins log"); } } @@ -723,18 +684,13 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { this.owner = owner; try { - initializeStorage(); - if (heads == null) { - LOGGER.log(Level.WARNING, "Null Flow heads after initializing storage for FlowExecution "+this); - } else if (heads.isEmpty()) { - LOGGER.log(Level.INFO, "Empty flow heads after initializing storage - probably an error, but odd - for FlowExecution "+this); - } - if (startNodes == null) { - LOGGER.log(Level.WARNING, "Null block start nodes after initializing storage for FlowExecution "+this); - } else if (startNodes.isEmpty() && heads != null && !(heads.isEmpty())) { - LOGGER.log(Level.INFO, "Empty block start nodes after initializing storage - not necessarily an error, but odd - for FlowExecution "+this); - } + initializeStorage(); // Throws exception and bombs out if we can't load FlowNodes + } catch (Exception ex) { + programPromise = Futures.immediateFailedFuture(ex); + throw ex; + } + try { if (!isComplete()) { // FOR SOME REASON we're arriving here even for *completed* builds sometimes, hopefully logging above helps if (canResume()) { @@ -747,7 +703,6 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { } } else if (done && !super.isComplete()) { LOGGER.log(Level.WARNING, "Completed flow without FlowEndNode: "+this+" heads:"+getHeadsAsString()); - } } catch (Exception e) { // Multicatch ensures that failure to load does not nuke the master SettableFuture p = SettableFuture.create(); @@ -756,7 +711,6 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { } finally { if (programPromise == null) { programPromise = Futures.immediateFailedFuture(new IllegalStateException("completed or broken execution")); - } } } diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java index 32602aea6..c3d95a7e8 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java @@ -3,6 +3,7 @@ import com.google.common.util.concurrent.ListenableFuture; import hudson.model.Queue; import hudson.model.Result; +import org.apache.commons.io.FileUtils; import org.jenkinsci.plugins.workflow.flow.FlowDurabilityHint; import org.jenkinsci.plugins.workflow.flow.FlowExecution; import org.jenkinsci.plugins.workflow.flow.FlowExecutionList; @@ -21,8 +22,10 @@ import org.jvnet.hudson.test.BuildWatcher; import org.jvnet.hudson.test.JenkinsRule; import org.jvnet.hudson.test.RestartableJenkinsRule; +import sun.misc.IOUtils; import java.io.File; +import java.nio.file.Files; import java.util.List; import java.util.concurrent.atomic.AtomicBoolean; @@ -36,6 +39,18 @@ public class PersistenceProblemsTest { @Rule public RestartableJenkinsRule story = new RestartableJenkinsRule(); + /** Execution bombed out due to some sort of irrecoverable persistence issue. */ + static void assertNulledExecution(WorkflowRun run) throws Exception { + if (run.isBuilding()) { + System.out.println("Run initially building, going to wait a second to see if it finishes, run="+run); + Thread.sleep(1000); + } + Assert.assertFalse(run.isBuilding()); + Assert.assertNotNull(run.getResult()); + FlowExecution fe = run.getExecution(); + Assert.assertNull(fe); + } + /** Verifies all the assumptions about a cleanly finished build. */ static void assertCompletedCleanly(WorkflowRun run) throws Exception { if (run.isBuilding()) { @@ -147,7 +162,7 @@ public void completedFinalFlowNodeNotPersisted() throws Exception { story.then(j-> { WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); WorkflowRun run = r.getBuildByNumber(build[0]); - assertCompletedCleanly(run); + assertNulledExecution(run); Assert.assertEquals(Result.SUCCESS, run.getResult()); }); } @@ -157,15 +172,12 @@ public void completedNoNodesPersisted() throws Exception { final int[] build = new int[1]; story.thenWithHardShutdown( j -> { WorkflowRun run = runBasicBuild(j, DEFAULT_JOBNAME, build); - - // Hack but deletes the FlowNodeStorage from disk - CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); - cpsExec.getStorageDir().delete(); + FileUtils.deleteDirectory(((CpsFlowExecution)(run.getExecution())).getStorageDir()); }); story.then(j-> { WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); WorkflowRun run = r.getBuildByNumber(build[0]); - assertCompletedCleanly(run); + assertNulledExecution(run); Assert.assertEquals(Result.SUCCESS, run.getResult()); }); } @@ -262,7 +274,7 @@ public void inProgressButFlowNodesLost() throws Exception { story.thenWithHardShutdown( j -> { WorkflowRun run = runBasicPauseOnInput(j, DEFAULT_JOBNAME, build); CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); - cpsExec.getStorageDir().delete(); + FileUtils.deleteDirectory(((CpsFlowExecution)(run.getExecution())).getStorageDir()); }); story.then( j->{ WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); From 6ffcf33c715e4de6349935daf459755782ea7a5b Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Wed, 25 Apr 2018 11:33:50 -0400 Subject: [PATCH 15/34] Pull in wf-job fix for not setting build result --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index aa129d3af..a0006d790 100644 --- a/pom.xml +++ b/pom.xml @@ -142,7 +142,7 @@ org.jenkins-ci.plugins.workflow workflow-job - 2.21-20180425.020403-1 + 2.21-20180425.142151-3 test From a1083fcd9e605366bca2de13d545383a124c10b8 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Wed, 25 Apr 2018 11:34:26 -0400 Subject: [PATCH 16/34] Fix test wait-for-input step and remove test that doesn't do what it seems for persistence verification --- .../workflow/cps/PersistenceProblemsTest.java | 31 +++++-------------- 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java index c3d95a7e8..66239d062 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java @@ -22,10 +22,8 @@ import org.jvnet.hudson.test.BuildWatcher; import org.jvnet.hudson.test.JenkinsRule; import org.jvnet.hudson.test.RestartableJenkinsRule; -import sun.misc.IOUtils; import java.io.File; -import java.nio.file.Files; import java.util.List; import java.util.concurrent.atomic.AtomicBoolean; @@ -126,10 +124,15 @@ private static WorkflowRun runBasicPauseOnInput(JenkinsRule j, String jobName, i WorkflowRun run = job.scheduleBuild2(0).getStartCondition().get(); ListenableFuture listener = run.getExecutionPromise(); FlowExecution exec = listener.get(); - while(run.getAction(InputAction.class) != null) { // Wait until input step starts + while(exec.getCurrentHeads().isEmpty() || (exec.getCurrentHeads().get(0) instanceof FlowStartNode)) { // Wait until input step starts + System.out.println("Waiting for input step to begin"); Thread.sleep(50); } - Thread.sleep(1000L); + while(run.getAction(InputAction.class) == null) { // Wait until input step starts + System.out.println("Waiting for input action to get attached to run"); + Thread.sleep(50); + } + Thread.sleep(100L); // A little extra buffer for persistence etc jobIdNumber[0] = run.getNumber(); return run; } @@ -283,26 +286,6 @@ public void inProgressButFlowNodesLost() throws Exception { }); } - /** Failed to save the Execution after the Program bumped the FlowHeads */ - @Test - public void inProgressButProgramAheadOfExecutionLost() throws Exception { - final int[] build = new int[1]; - story.thenWithHardShutdown( j -> { - WorkflowRun run = runBasicPauseOnInput(j, DEFAULT_JOBNAME, build); - CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); - FlowHead currHead = cpsExec.heads.firstEntry().getValue(); - - // Set the head to the previous value and SAVE - currHead.head = cpsExec.getCurrentHeads().get(0).getParents().get(0); - run.save(); - }); - story.then( j->{ - WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); - WorkflowRun run = r.getBuildByNumber(build[0]); - assertCompletedCleanly(run); - }); - } - @Test /** Build okay but program fails to load */ public void inProgressButProgramLoadFailure() throws Exception { From 2d3a219f983e427d46d24a33c327c5ad053189d0 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Wed, 25 Apr 2018 11:36:02 -0400 Subject: [PATCH 17/34] Fix test for bogus done status: set done flag on execution if FlowEndNode present and save + ensure we don't block a Jenkins restart if the programPromise errors rather than loading --- .../workflow/cps/CpsFlowExecution.java | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index bc7c3299d..40dbd2ed4 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -691,8 +691,16 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { } try { - if (!isComplete()) { - // FOR SOME REASON we're arriving here even for *completed* builds sometimes, hopefully logging above helps + if (isComplete()) { + if (done == Boolean.TRUE && !super.isComplete()) { + LOGGER.log(Level.WARNING, "Completed flow without FlowEndNode: "+this+" heads:"+getHeadsAsString()); + } + if (super.isComplete() && done != Boolean.TRUE) { + LOGGER.log(Level.WARNING, "Flow has FlowEndNode, but is not marked as done, fixing this for"+this); + done = true; + saveOwner(); + } + } else { // See if we can/should resume build if (canResume()) { loadProgramAsync(getProgramDataFile()); } else { @@ -701,8 +709,6 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { LOGGER.log(Level.WARNING, "Pipeline state not properly persisted, cannot resume "+owner.getUrl()); throw new IOException("Cannot resume build -- was not cleanly saved when Jenkins shut down."); } - } else if (done && !super.isComplete()) { - LOGGER.log(Level.WARNING, "Completed flow without FlowEndNode: "+this+" heads:"+getHeadsAsString()); } } catch (Exception e) { // Multicatch ensures that failure to load does not nuke the master SettableFuture p = SettableFuture.create(); @@ -878,15 +884,19 @@ public void onFailure(Throwable t) { }); } + /** See JENKINS-22941 for why this exists. */ @Override public boolean blocksRestart() { if (programPromise == null || !programPromise.isDone()) { + // Can't restart cleanly while trying to set up the build return true; } CpsThreadGroup g; try { g = programPromise.get(); } catch (Exception x) { - return true; + // FIXME check this won't cause issues due to depickling delays etc + LOGGER.log(Level.FINE, "Not blocking restart due to exception in ProgramPromise: "+this, x); + return false; } return g.busy; } From 12d6c8e209870041fb1149102a8a947120ac1293 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Wed, 25 Apr 2018 11:42:15 -0400 Subject: [PATCH 18/34] Simplify the flownodeStorage initialization and note that we need to rebuild flow graph if possible --- .../workflow/cps/CpsFlowExecution.java | 58 ++++++++----------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 40dbd2ed4..973f0b783 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -616,41 +616,30 @@ private synchronized String getHeadsAsString() { @SuppressFBWarnings(value = "IS2_INCONSISTENT_SYNC", justification="Storage does not actually NEED to be synchronized but the rest does.") protected synchronized void initializeStorage() throws IOException { - boolean storageErrors = false; // Maybe storage didn't get to persist properly or files were deleted. try { storage = createStorage(); - - heads = new TreeMap(); - for (Map.Entry entry : headsSerial.entrySet()) { - FlowHead h = new FlowHead(this, entry.getKey()); - - FlowNode n = storage.getNode(entry.getValue()); - if (n != null) { - h.setForDeserialize(storage.getNode(entry.getValue())); - heads.put(h.getId(), h); - } else { - LOGGER.log(Level.WARNING, "Tried to load head FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for head id:FlowNodeId "+entry.getKey()+":"+entry.getValue()); - storageErrors = true; - break; - } + heads = new TreeMap(); + for (Map.Entry entry : headsSerial.entrySet()) { + FlowHead h = new FlowHead(this, entry.getKey()); + + FlowNode n = storage.getNode(entry.getValue()); + if (n != null) { + h.setForDeserialize(storage.getNode(entry.getValue())); + heads.put(h.getId(), h); + } else { + throw new IOException("Tried to load head FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for head id:FlowNodeId "+entry.getKey()+":"+entry.getValue()); } - + } headsSerial = null; - if (!storageErrors) { - // Same for startNodes: - storageErrors = false; - startNodes = new Stack(); - for (String id : startNodesSerial) { - FlowNode node = storage.getNode(id); - if (node != null) { - startNodes.add((BlockStartNode) storage.getNode(id)); - } else { - // TODO if possible, consider trying to close out unterminated blocks using heads, to keep existing graph history - LOGGER.log(Level.WARNING, "Tried to load startNode FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for FlowNode Id "+id); - storageErrors = true; - break; - } + startNodes = new Stack(); + for (String id : startNodesSerial) { + FlowNode node = storage.getNode(id); + if (node != null) { + startNodes.add((BlockStartNode) storage.getNode(id)); + } else { + // TODO if possible, consider trying to close out unterminated blocks using heads, to keep existing graph history + throw new IOException( "Tried to load startNode FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for FlowNode Id "+id); } } startNodesSerial = null; @@ -658,10 +647,6 @@ protected synchronized void initializeStorage() throws IOException { } catch (Exception ioe) { this.done = Boolean.TRUE; LOGGER.log(Level.WARNING, "Error initializing storage and loading nodes for "+this, ioe); - storageErrors = true; - } - - if (storageErrors) { throw new IOException("Failed to load FlowNodes for build, see errors in Jenkins log"); } } @@ -685,6 +670,11 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { try { initializeStorage(); // Throws exception and bombs out if we can't load FlowNodes + // TODO attempt to mock out the remaining bits of the FlowGraph and terminate cleanly as needed + // This includes setting done, persistedClean, programPromise values + // Creating new start/end nodes in a new storage directory and updating heads + starts appropriately. + // Plus invoking graphListeners potentially and setting build result explicitly by fetching the Executable runs + // WHY? This lets us retain scripts etc } catch (Exception ex) { programPromise = Futures.immediateFailedFuture(ex); throw ex; From 85e99f1519f545c8c96cad183a6be8c53affe727 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Fri, 27 Apr 2018 17:48:04 -0400 Subject: [PATCH 19/34] Mostly-working way to build placeholder flownodes for inconsistently stored state --- pom.xml | 2 +- .../workflow/cps/CpsFlowExecution.java | 125 ++++++++++++------ .../workflow/cps/PersistenceProblemsTest.java | 23 +++- 3 files changed, 109 insertions(+), 41 deletions(-) diff --git a/pom.xml b/pom.xml index a0006d790..b0ece6ade 100644 --- a/pom.xml +++ b/pom.xml @@ -142,7 +142,7 @@ org.jenkins-ci.plugins.workflow workflow-job - 2.21-20180425.142151-3 + 2.21-20180427.163721-4 test diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 973f0b783..295ed8a15 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -614,41 +614,80 @@ private synchronized String getHeadsAsString() { } } - @SuppressFBWarnings(value = "IS2_INCONSISTENT_SYNC", justification="Storage does not actually NEED to be synchronized but the rest does.") - protected synchronized void initializeStorage() throws IOException { + /** + * In the event we're missing FlowNodes, fail-fast and create some mockup FlowNodes so we can continue. + * This avoids nulling out all of the execution's data + * Bypasses {@link #croak(Throwable)} and {@link #onProgramEnd(Outcome)} to guarantee a clean path. + */ + @GuardedBy("this") + void createPlaceholderNodes(Throwable failureReason) throws Exception { try { - storage = createStorage(); - heads = new TreeMap(); - for (Map.Entry entry : headsSerial.entrySet()) { - FlowHead h = new FlowHead(this, entry.getKey()); - - FlowNode n = storage.getNode(entry.getValue()); - if (n != null) { - h.setForDeserialize(storage.getNode(entry.getValue())); - heads.put(h.getId(), h); - } else { - throw new IOException("Tried to load head FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for head id:FlowNodeId "+entry.getKey()+":"+entry.getValue()); + programPromise = Futures.immediateFailedFuture(new IllegalStateException("Failed loading heads", failureReason)); + LOGGER.log(Level.INFO, "Creating placeholder flownodes for execution: "+this); + if (this.owner != null) { + try { + owner.getListener().getLogger().println("Creating placeholder flownodes because failed loading originals."); + } catch (Exception ex) { + // It's okay to fail to log } } - headsSerial = null; - startNodes = new Stack(); - for (String id : startNodesSerial) { - FlowNode node = storage.getNode(id); - if (node != null) { - startNodes.add((BlockStartNode) storage.getNode(id)); - } else { - // TODO if possible, consider trying to close out unterminated blocks using heads, to keep existing graph history - throw new IOException( "Tried to load startNode FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for FlowNode Id "+id); - } + // Switch to fallback storage so we don't delete original node data + this.storageDir = (this.storageDir != null) ? this.storageDir+"-fallback" : "workflow-fallback"; + this.storage = createStorage(); // Empty storage + + // Clear out old start nodes and heads + this.startNodes = new Stack(); + FlowHead head = new FlowHead(this); + this.heads = new TreeMap(); + heads.put(head.getId(), head); + FlowStartNode start = new FlowStartNode(this, iotaStr()); + head.newStartNode(start); + + // Create end + FlowNode end = new FlowEndNode(this, iotaStr(), (FlowStartNode)startNodes.pop(), result, getCurrentHeads().toArray(new FlowNode[0])); + end.addAction(new ErrorAction(failureReason)); + head.setNewHead(end); + + if (this.result == null) { + setResult(Result.FAILURE); + } + saveOwner(); + + } catch (Exception ex) { + this.done = true; + throw ex; + } + } + + @SuppressFBWarnings(value = "IS2_INCONSISTENT_SYNC", justification="Storage does not actually NEED to be synchronized but the rest does.") + protected synchronized void initializeStorage() throws IOException { + storage = createStorage(); + heads = new TreeMap(); + for (Map.Entry entry : headsSerial.entrySet()) { + FlowHead h = new FlowHead(this, entry.getKey()); + + FlowNode n = storage.getNode(entry.getValue()); + if (n != null) { + h.setForDeserialize(storage.getNode(entry.getValue())); + heads.put(h.getId(), h); + } else { + throw new IOException("Tried to load head FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for head id:FlowNodeId "+entry.getKey()+":"+entry.getValue()); } - startNodesSerial = null; + } + headsSerial = null; - } catch (Exception ioe) { - this.done = Boolean.TRUE; - LOGGER.log(Level.WARNING, "Error initializing storage and loading nodes for "+this, ioe); - throw new IOException("Failed to load FlowNodes for build, see errors in Jenkins log"); + startNodes = new Stack(); + for (String id : startNodesSerial) { + FlowNode node = storage.getNode(id); + if (node != null) { + startNodes.add((BlockStartNode) storage.getNode(id)); + } else { + // TODO if possible, consider trying to close out unterminated blocks using heads, to keep existing graph history + throw new IOException( "Tried to load startNode FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for FlowNode Id "+id); + } } + startNodesSerial = null; } /** If true, we are allowed to resume the build because resume is enabled AND we shut down cleanly. */ @@ -669,15 +708,17 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { this.owner = owner; try { - initializeStorage(); // Throws exception and bombs out if we can't load FlowNodes - // TODO attempt to mock out the remaining bits of the FlowGraph and terminate cleanly as needed - // This includes setting done, persistedClean, programPromise values - // Creating new start/end nodes in a new storage directory and updating heads + starts appropriately. - // Plus invoking graphListeners potentially and setting build result explicitly by fetching the Executable runs - // WHY? This lets us retain scripts etc + try { + initializeStorage(); // Throws exception and bombs out if we can't load FlowNodes + } catch (Exception ex) { + LOGGER.log(Level.WARNING, "Error initializing storage and loading nodes, will try to create placeholders for: "+this, ex); + createPlaceholderNodes(ex); + return; + } } catch (Exception ex) { + done = true; programPromise = Futures.immediateFailedFuture(ex); - throw ex; + throw new IOException("Failed to even create placeholder nodes for execution", ex); } try { @@ -1182,9 +1223,10 @@ synchronized void onProgramEnd(Outcome outcome) { // shrink everything into a single new head try { if (heads != null) { + // Below does not look correct to me FlowHead first = getFirstHead(); first.setNewHead(head); - done = Boolean.TRUE; // After setting the final head + done = true; // After setting the final head heads.clear(); heads.put(first.getId(), first); @@ -1196,7 +1238,7 @@ synchronized void onProgramEnd(Outcome outcome) { } } } catch (Exception ex) { - done = Boolean.TRUE; + done = true; LOGGER.log(Level.WARNING, "Error trying to end execution "+this, ex); } @@ -1428,6 +1470,13 @@ public String getNextScriptName(String path) { return shell.generateScriptName().replaceFirst("[.]groovy$", ""); } + /** Has the execution been marked done - note that legacy builds may not have that flag persisted, in which case + * we look for a single FlowEndNode head (see: {@link #isComplete()} and {@link FlowExecution#isComplete()}) + */ + public boolean isDoneFlagSet() { + return done; + } + public boolean isPaused() { if (programPromise.isDone()) { try { @@ -1455,7 +1504,7 @@ public void pause(final boolean v) throws IOException { if (executable instanceof AccessControlled) { ((AccessControlled) executable).checkPermission(Item.CANCEL); } - done = Boolean.FALSE; + done = false; Futures.addCallback(programPromise, new FutureCallback() { @Override public void onSuccess(CpsThreadGroup g) { if (v) { diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java index 66239d062..4e81de31d 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/PersistenceProblemsTest.java @@ -99,6 +99,11 @@ static void assertCleanInProgress(WorkflowRun run) throws Exception { Assert.assertTrue(cpsExec.startNodes != null && !cpsExec.startNodes.isEmpty()); } + static void assertResultMatchExecutionAndRun(WorkflowRun run, Result[] executionAndBuildResult) throws Exception { + Assert.assertEquals(executionAndBuildResult[0], ((CpsFlowExecution)(run.getExecution())).getResult()); + Assert.assertEquals(executionAndBuildResult[1], run.getResult()); + } + /** Create and run a basic build before we mangle its persisted contents. Stores job number to jobIdNumber index 0. */ private static WorkflowRun runBasicBuild(JenkinsRule j, String jobName, int[] jobIdNumber, FlowDurabilityHint hint) throws Exception { WorkflowJob job = j.jenkins.createProject(WorkflowJob.class, jobName); @@ -153,6 +158,7 @@ private static InputStepExecution getInputStepExecution(WorkflowRun run, String @Test public void completedFinalFlowNodeNotPersisted() throws Exception { final int[] build = new int[1]; + final Result[] executionAndBuildResult = new Result[2]; story.thenWithHardShutdown( j -> { WorkflowRun run = runBasicBuild(j, DEFAULT_JOBNAME, build); String finalId = run.getExecution().getCurrentHeads().get(0).getId(); @@ -161,27 +167,36 @@ public void completedFinalFlowNodeNotPersisted() throws Exception { CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); File f = new File(cpsExec.getStorageDir(), finalId+".xml"); f.delete(); + executionAndBuildResult[0] = ((CpsFlowExecution)(run.getExecution())).getResult(); + executionAndBuildResult[1] = run.getResult(); }); story.then(j-> { WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); WorkflowRun run = r.getBuildByNumber(build[0]); - assertNulledExecution(run); + assertCompletedCleanly(run); + // assertNulledExecution(run); Assert.assertEquals(Result.SUCCESS, run.getResult()); + assertResultMatchExecutionAndRun(run, executionAndBuildResult); }); } /** Perhaps there was a serialization error breaking the FlowGraph persistence for non-durable mode. */ @Test public void completedNoNodesPersisted() throws Exception { final int[] build = new int[1]; + final Result[] executionAndBuildResult = new Result[2]; story.thenWithHardShutdown( j -> { WorkflowRun run = runBasicBuild(j, DEFAULT_JOBNAME, build); FileUtils.deleteDirectory(((CpsFlowExecution)(run.getExecution())).getStorageDir()); + executionAndBuildResult[0] = ((CpsFlowExecution)(run.getExecution())).getResult(); + executionAndBuildResult[1] = run.getResult(); }); story.then(j-> { WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); WorkflowRun run = r.getBuildByNumber(build[0]); - assertNulledExecution(run); + assertCompletedCleanly(run); + // assertNulledExecution(run); Assert.assertEquals(Result.SUCCESS, run.getResult()); + assertResultMatchExecutionAndRun(run, executionAndBuildResult); }); } @@ -189,6 +204,7 @@ public void completedNoNodesPersisted() throws Exception { @Test public void completedButWrongDoneStatus() throws Exception { final int[] build = new int[1]; + final Result[] executionAndBuildResult = new Result[2]; story.thenWithHardShutdown( j -> { WorkflowRun run = runBasicBuild(j, DEFAULT_JOBNAME, build); String finalId = run.getExecution().getCurrentHeads().get(0).getId(); @@ -197,12 +213,15 @@ public void completedButWrongDoneStatus() throws Exception { CpsFlowExecution cpsExec = (CpsFlowExecution)(run.getExecution()); cpsExec.done = false; cpsExec.saveOwner(); + executionAndBuildResult[0] = ((CpsFlowExecution)(run.getExecution())).getResult(); + executionAndBuildResult[1] = run.getResult(); }); story.then(j-> { WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class); WorkflowRun run = r.getBuildByNumber(build[0]); assertCompletedCleanly(run); Assert.assertEquals(Result.SUCCESS, run.getResult()); + assertResultMatchExecutionAndRun(run, executionAndBuildResult); }); } From 2ec83cc66c1ef43e10729a8d4db661131fb1e56b Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Fri, 27 Apr 2018 18:39:56 -0400 Subject: [PATCH 20/34] Pull in wf-job fixes to notifications and ensure done is set if we add placeholder nodes --- pom.xml | 4 ++-- .../org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index b0ece6ade..447124bce 100644 --- a/pom.xml +++ b/pom.xml @@ -141,8 +141,8 @@ org.jenkins-ci.plugins.workflow workflow-job - - 2.21-20180427.163721-4 + + 2.21-20180427.223321-5 test diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 295ed8a15..44df1e1d3 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -621,6 +621,7 @@ private synchronized String getHeadsAsString() { */ @GuardedBy("this") void createPlaceholderNodes(Throwable failureReason) throws Exception { + this.done = true; try { programPromise = Futures.immediateFailedFuture(new IllegalStateException("Failed loading heads", failureReason)); LOGGER.log(Level.INFO, "Creating placeholder flownodes for execution: "+this); @@ -655,7 +656,6 @@ void createPlaceholderNodes(Throwable failureReason) throws Exception { saveOwner(); } catch (Exception ex) { - this.done = true; throw ex; } } From d4eb956c727e8891de7b285520cb8eb2ced5e2dc Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Fri, 27 Apr 2018 18:59:25 -0400 Subject: [PATCH 21/34] Fix result handling for build --- .../plugins/workflow/cps/CpsFlowExecution.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 44df1e1d3..5b54fe9bc 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -622,6 +622,12 @@ private synchronized String getHeadsAsString() { @GuardedBy("this") void createPlaceholderNodes(Throwable failureReason) throws Exception { this.done = true; + + if (this.owner != null && this.owner.getExecutable() instanceof Run) { + Run r = (Run)(owner.getExecutable()); + setResult(r.getResult() != null ? r.getResult() : Result.FAILURE); + } + try { programPromise = Futures.immediateFailedFuture(new IllegalStateException("Failed loading heads", failureReason)); LOGGER.log(Level.INFO, "Creating placeholder flownodes for execution: "+this); @@ -649,10 +655,6 @@ void createPlaceholderNodes(Throwable failureReason) throws Exception { FlowNode end = new FlowEndNode(this, iotaStr(), (FlowStartNode)startNodes.pop(), result, getCurrentHeads().toArray(new FlowNode[0])); end.addAction(new ErrorAction(failureReason)); head.setNewHead(end); - - if (this.result == null) { - setResult(Result.FAILURE); - } saveOwner(); } catch (Exception ex) { From 9c9e9585b1eea3f2bc33ededf58f59b9172583a5 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Fri, 27 Apr 2018 18:59:33 -0400 Subject: [PATCH 22/34] Comment out unusable test --- .../org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java index bb6c60d82..d52eadaef 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java @@ -605,6 +605,7 @@ public void evaluate() throws Throwable { /** Verify that if we bomb out because we cannot resume, we at least try to finish the flow graph if we have something to work with. */ @Test + @Ignore // Can be fleshed out later if we have a valid need for it. public void testPipelineFinishesFlowGraph() throws Exception { final String[] logStart = new String[1]; From a87587a3932dfce45ac031396e0905cd8dd2f1cd Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Fri, 27 Apr 2018 19:41:59 -0400 Subject: [PATCH 23/34] Handle fuzzers for durability killing run before it can be saved --- .../plugins/workflow/cps/FlowDurabilityTest.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java index d52eadaef..bcd18c473 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java @@ -877,6 +877,9 @@ public void evaluate() throws Throwable { @Override public void evaluate() throws Throwable { WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); + if (run == null) { + return; + } if (run.getExecution() != null) { Assert.assertEquals(FlowDurabilityHint.MAX_SURVIVABILITY, run.getExecution().getDurabilityHint()); } @@ -922,6 +925,9 @@ public void evaluate() throws Throwable { @Override public void evaluate() throws Throwable { WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); + if (run == null) { + return; + } if (run.getExecution() != null) { Assert.assertEquals(FlowDurabilityHint.PERFORMANCE_OPTIMIZED, run.getExecution().getDurabilityHint()); } @@ -936,6 +942,9 @@ public void evaluate() throws Throwable { // Verify build doesn't resume at next restart, see JENKINS-50199 Assert.assertFalse(FlowExecutionList.get().iterator().hasNext()); WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); + if (run == null) { + return; + } Assert.assertFalse(run.isBuilding()); Assert.assertTrue(run.getExecution().isComplete()); if (run.getExecution() instanceof CpsFlowExecution) { @@ -974,6 +983,9 @@ public void evaluate() throws Throwable { @Override public void evaluate() throws Throwable { WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); + if (run == null) { + return; + } if (run.getExecution() != null) { Assert.assertEquals(FlowDurabilityHint.PERFORMANCE_OPTIMIZED, run.getExecution().getDurabilityHint()); } From adc109f6988328abcb7ef3ade953c618117cc0ca Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Mon, 30 Apr 2018 10:57:41 -0400 Subject: [PATCH 24/34] Try to save other FlowExecutions even if one fails --- .../workflow/cps/CpsFlowExecution.java | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 5b54fe9bc..b67b1d418 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -1538,23 +1538,27 @@ public void pause(final boolean v) throws IOException { try (Timeout t = Timeout.limit(3, TimeUnit.MINUTES)) { // TODO some complicated sequence of calls to Futures could allow all of them to run in parallel LOGGER.fine("starting to suspend all executions"); for (FlowExecution execution : FlowExecutionList.get()) { - if (execution instanceof CpsFlowExecution) { - CpsFlowExecution cpsExec = (CpsFlowExecution)execution; - cpsExec.checkAndAbortNonresumableBuild(); - - LOGGER.log(Level.FINE, "waiting to suspend {0}", execution); - exec = (CpsFlowExecution) execution; - // Like waitForSuspension but with a timeout: - if (exec.programPromise != null) { - LOGGER.log(Level.FINER, "Waiting for Pipeline to go to sleep for shutdown: "+execution); - try { - exec.programPromise.get(1, TimeUnit.MINUTES).scheduleRun().get(1, TimeUnit.MINUTES); - LOGGER.log(Level.FINER, " Pipeline went to sleep OK: "+execution); - } catch (InterruptedException | TimeoutException ex) { - LOGGER.log(Level.WARNING, "Error waiting for Pipeline to suspend: "+exec, ex); + try { + if (execution instanceof CpsFlowExecution) { + CpsFlowExecution cpsExec = (CpsFlowExecution)execution; + cpsExec.checkAndAbortNonresumableBuild(); + + LOGGER.log(Level.FINE, "waiting to suspend {0}", execution); + exec = (CpsFlowExecution) execution; + // Like waitForSuspension but with a timeout: + if (exec.programPromise != null) { + LOGGER.log(Level.FINER, "Waiting for Pipeline to go to sleep for shutdown: "+execution); + try { + exec.programPromise.get(1, TimeUnit.MINUTES).scheduleRun().get(1, TimeUnit.MINUTES); + LOGGER.log(Level.FINER, " Pipeline went to sleep OK: "+execution); + } catch (InterruptedException | TimeoutException ex) { + LOGGER.log(Level.WARNING, "Error waiting for Pipeline to suspend: "+exec, ex); + } } + cpsExec.checkpoint(); } - cpsExec.checkpoint(); + } catch (Exception ex) { + LOGGER.log(Level.WARNING, "Error persisting Pipeline execution at shutdown: "+((CpsFlowExecution) execution).owner, ex); } } LOGGER.fine("finished suspending all executions"); From fc9384485d801f2416c3e28bbf842ccd79f3f8ac Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Mon, 30 Apr 2018 12:15:20 -0400 Subject: [PATCH 25/34] Make findbugs hush --- .../jenkinsci/plugins/workflow/cps/CpsFlowExecution.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index b67b1d418..cb9c07fca 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -623,9 +623,12 @@ private synchronized String getHeadsAsString() { void createPlaceholderNodes(Throwable failureReason) throws Exception { this.done = true; - if (this.owner != null && this.owner.getExecutable() instanceof Run) { - Run r = (Run)(owner.getExecutable()); - setResult(r.getResult() != null ? r.getResult() : Result.FAILURE); + if (this.owner != null) { + Queue.Executable ex = owner.getExecutable(); + if (ex instanceof Run) { + Result res = ((Run)ex).getResult(); + setResult(res != null ? res : Result.FAILURE); + } } try { From c7f44a776579d0647a59a1dd694c7bc0294e5418 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Mon, 30 Apr 2018 18:35:17 -0400 Subject: [PATCH 26/34] Update docs --- doc/persistence.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/persistence.md b/doc/persistence.md index 1403dc537..6ff4b405d 100644 --- a/doc/persistence.md +++ b/doc/persistence.md @@ -3,11 +3,11 @@ # Data Model Running pipelines persist in 3 pieces: -1. The FlowNodes - stored by a FlowNodeStorage - this holds the FlowNodes created to map to Steps, and for block scoped Steps, start and end of blocks -2. The CpsFlowExecution - this is currently stored in the WorkflowRun, and the primary pieces of interest are: +1. The `FlowNode`s - stored by a `FlowNodeStorage` - this holds the FlowNodes created to map to `Step`s, and for block scoped Steps, start and end of blocks +2. The `CpsFlowExecution` - this is currently stored in the WorkflowRun, and the primary pieces of interest are: * heads - the current "tips" of the Flow Graph, i.e. the FlowNodes that represent running steps and are appended to - - A head maps to a CpsThread in the Pipeline program, within the CPSThreadGroup - * starts - the BlockStartNodes marking the start(s) of the currently executing blocks + - A head maps to a `CpsThread` in the Pipeline program, within the `CpsThreadGroup` + * starts - the `BlockStartNode`s marking the start(s) of the currently executing blocks * scripts - the loaded Pipeline script files (text) * persistedClean - If true, Pipeline saved its execution cleanly to disk and we *might* be able to resume it @@ -18,7 +18,8 @@ Running pipelines persist in 3 pieces: * * various other boolean flags & settings for the execution (durability setting, user that started the build, is it sandboxed, etc) 3. The Program -- this is the current execution state of the Pipeline - * This holds the Groovy state (transformed by CPS) plus the CPSThreadGroup for the running branches of the Pipeline + * This holds the Groovy state - the `CpsThreadGroup` - with runtime calls transformed by CPS so they can persist + * The `CpsThread`s map to the running branches of the Pipeline * The program depends on the FlowNodes from the FlowNodeStorage, since it reads them by ID rather than storing them in the program * This also depends on the heads in the CpsFlowExecution, because its FlowHeads are loaded from the heads of the CpsFlowExecution * Also holds the CpsStepContext, i.e. the variables such as EnvVars, Executor and Workspace uses (the latter stored as Pickles) From 7567a0a050171622a46b48762a3e8742d981f56b Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Mon, 30 Apr 2018 20:12:38 -0400 Subject: [PATCH 27/34] Use locking to avoid ConcurrentModificationException on FlowGraph while trying to persist state at end of execution --- .../workflow/cps/CpsFlowExecution.java | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index cb9c07fca..321f8c65f 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -99,6 +99,8 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.logging.Level; import java.util.logging.Logger; @@ -1742,60 +1744,86 @@ private T readChild(HierarchicalStreamReader r, UnmarshallingContext context class TimingFlowNodeStorage extends FlowNodeStorage { private final FlowNodeStorage delegate; + ReadWriteLock readWriteLock = new ReentrantReadWriteLock(); + TimingFlowNodeStorage(FlowNodeStorage delegate) { this.delegate = delegate; } @Override public FlowNode getNode(String string) throws IOException { + readWriteLock.readLock().lock(); try (Timing t = time(TimingKind.flowNode)) { return delegate.getNode(string); + } finally { + readWriteLock.readLock().unlock(); } } @Override public void storeNode(@Nonnull FlowNode n) throws IOException { + readWriteLock.writeLock().lock(); try (Timing t = time(TimingKind.flowNode)) { delegate.storeNode(n); + } finally { + readWriteLock.writeLock().unlock(); } } @Override public void storeNode(@Nonnull FlowNode n, boolean delayWritingActions) throws IOException { + readWriteLock.writeLock().lock(); try (Timing t = time(TimingKind.flowNode)) { delegate.storeNode(n, delayWritingActions); + } finally { + readWriteLock.writeLock().unlock(); } } @Override public void flush() throws IOException { + readWriteLock.writeLock().lock(); try (Timing t = time(TimingKind.flowNode)) { delegate.flush(); + } finally { + readWriteLock.writeLock().unlock(); } } @Override public void flushNode(FlowNode node) throws IOException { + readWriteLock.writeLock().lock(); try (Timing t = time(TimingKind.flowNode)) { delegate.flushNode(node); + } finally { + readWriteLock.writeLock().unlock(); } } @Override public void autopersist(@Nonnull FlowNode n) throws IOException { + readWriteLock.writeLock().lock(); try (Timing t = time(TimingKind.flowNode)) { delegate.autopersist(n); + } finally { + readWriteLock.writeLock().unlock(); } } @Override public List loadActions(FlowNode node) throws IOException { + readWriteLock.readLock().lock(); try (Timing t = time(TimingKind.flowNode)) { return delegate.loadActions(node); + } finally { + readWriteLock.readLock().unlock(); } } @Override public void saveActions(FlowNode node, List actions) throws IOException { + readWriteLock.writeLock().lock(); try (Timing t = time(TimingKind.flowNode)) { delegate.saveActions(node, actions); + } finally { + readWriteLock.writeLock().unlock(); } } } From 297d820c1b20c853c6d45ee9a56d7d6558fa2ebe Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 1 May 2018 09:26:18 -0400 Subject: [PATCH 28/34] Fix typo --- src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java index 103f9f7f5..c525c5ca2 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java @@ -197,7 +197,7 @@ private Object readResolve() { } return myHead; } else { - LOGGER.log(Level.WARNING, "Tried to load a FlowHead from program with no Execution in PROGRAM_STATE_SERIALIZTION"); + LOGGER.log(Level.WARNING, "Tried to load a FlowHead from program with no Execution in PROGRAM_STATE_SERIALIZATION"); return this; } } From b51d1ba2768ee9fa277cc4cdd36fbd05ee88bd9c Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 1 May 2018 10:47:46 -0400 Subject: [PATCH 29/34] Cleanup --- .../org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 321f8c65f..532eda861 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -35,7 +35,6 @@ import com.cloudbees.jenkins.support.api.Component; import com.cloudbees.jenkins.support.api.Container; import com.cloudbees.jenkins.support.api.Content; -import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.util.concurrent.FutureCallback; @@ -234,7 +233,6 @@ * @author Kohsuke Kawaguchi */ @PersistIn(RUN) -@SuppressFBWarnings(value = "IS_FIELD_NOT_GUARDED", justification = "temporary only") public class CpsFlowExecution extends FlowExecution implements BlockableResume { /** * Groovy script of the main source file (that the user enters in the GUI) From 25161118bb569096128de8d105e4b929e1450394 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 1 May 2018 18:20:54 -0400 Subject: [PATCH 30/34] Review changes --- .../plugins/workflow/cps/CpsFlowExecution.java | 6 +++--- .../java/org/jenkinsci/plugins/workflow/cps/DSL.java | 2 +- .../plugins/workflow/cps/FlowDurabilityTest.java | 12 ++++-------- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 532eda861..419bb3040 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -624,6 +624,7 @@ void createPlaceholderNodes(Throwable failureReason) throws Exception { this.done = true; if (this.owner != null) { + // Ensure that the Run is marked as completed (failed) if it isn't already so it won't show as running Queue.Executable ex = owner.getExecutable(); if (ex instanceof Run) { Result res = ((Run)ex).getResult(); @@ -746,7 +747,7 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { throw new IOException("Cannot resume build -- was not cleanly saved when Jenkins shut down."); } } - } catch (Exception e) { // Multicatch ensures that failure to load does not nuke the master + } catch (Exception e) { // Broad catch ensures that failure to load do NOT nuke the master SettableFuture p = SettableFuture.create(); programPromise = p; loadProgramFailed(e, p); @@ -930,7 +931,7 @@ public void onFailure(Throwable t) { try { g = programPromise.get(); } catch (Exception x) { - // FIXME check this won't cause issues due to depickling delays etc + // TODO Check this won't cause issues due to depickling delays etc LOGGER.log(Level.FINE, "Not blocking restart due to exception in ProgramPromise: "+this, x); return false; } @@ -1228,7 +1229,6 @@ synchronized void onProgramEnd(Outcome outcome) { // shrink everything into a single new head try { if (heads != null) { - // Below does not look correct to me FlowHead first = getFirstHead(); first.setNewHead(head); done = true; // After setting the final head diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/DSL.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/DSL.java index 9bfd5c506..8603934e6 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/DSL.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/DSL.java @@ -578,7 +578,7 @@ private void invokeBody(CpsThread cur) { // we want to do this first before starting body so that the order of heads preserve // natural ordering. - // FIXME give this javadocs worth a darn, because this is how we create parallel branches and the docs are crypic. + // TODO give this javadocs worth a darn, because this is how we create parallel branches and the docs are cryptic as can be! // Also we need to double-check this logic because this might cause a failure of persistence FlowHead[] heads = new FlowHead[context.bodyInvokers.size()]; for (int i = 0; i < heads.length; i++) { diff --git a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java index bcd18c473..854f71c7f 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/cps/FlowDurabilityTest.java @@ -794,11 +794,7 @@ public void evaluate() throws Throwable { private static void assertBuildNotHung(@Nonnull RestartableJenkinsRule story, @Nonnull WorkflowRun run, int timeOutMillis) throws Exception { if (run.isBuilding()) { - try { - story.j.waitUntilNoActivityUpTo(timeOutMillis); - } catch (AssertionError ase) { // Allows attaching a debugger here - throw new AssertionError("Build hung: " + run, ase); - } + story.j.waitUntilNoActivityUpTo(timeOutMillis); } } @@ -877,7 +873,7 @@ public void evaluate() throws Throwable { @Override public void evaluate() throws Throwable { WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); - if (run == null) { + if (run == null) { // Build killed so early the Run did not get to persist return; } if (run.getExecution() != null) { @@ -925,7 +921,7 @@ public void evaluate() throws Throwable { @Override public void evaluate() throws Throwable { WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); - if (run == null) { + if (run == null) { // Build killed so early the Run did not get to persist return; } if (run.getExecution() != null) { @@ -983,7 +979,7 @@ public void evaluate() throws Throwable { @Override public void evaluate() throws Throwable { WorkflowRun run = story.j.jenkins.getItemByFullName(jobName, WorkflowJob.class).getLastBuild(); - if (run == null) { + if (run == null) { // Build killed so early the Run did not get to persist return; } if (run.getExecution() != null) { From 474f325e62a69cf1cf1162e5ce6418f54ff69711 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 1 May 2018 18:26:37 -0400 Subject: [PATCH 31/34] Update SNAPSHOT --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 447124bce..caf00f7d9 100644 --- a/pom.xml +++ b/pom.xml @@ -142,7 +142,7 @@ org.jenkins-ci.plugins.workflow workflow-job - 2.21-20180427.223321-5 + 2.21-20180501.222458-6 test From 46cd988c10d374361708b0aee152e3289d4369c5 Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Tue, 1 May 2018 21:47:03 -0400 Subject: [PATCH 32/34] Fix findbugs kvetching about synchronization --- .../org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 419bb3040..14ff3f628 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -620,7 +620,7 @@ private synchronized String getHeadsAsString() { * Bypasses {@link #croak(Throwable)} and {@link #onProgramEnd(Outcome)} to guarantee a clean path. */ @GuardedBy("this") - void createPlaceholderNodes(Throwable failureReason) throws Exception { + synchronized void createPlaceholderNodes(Throwable failureReason) throws Exception { this.done = true; if (this.owner != null) { From 5e16362ad6b4dbc39512e9905a58e96f2631ea0b Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Wed, 2 May 2018 09:23:01 -0400 Subject: [PATCH 33/34] Reduce logging verbosity a bit --- .../org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java index 14ff3f628..5b9e2a59d 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsFlowExecution.java @@ -730,10 +730,10 @@ public void onLoad(FlowExecutionOwner owner) throws IOException { try { if (isComplete()) { if (done == Boolean.TRUE && !super.isComplete()) { - LOGGER.log(Level.WARNING, "Completed flow without FlowEndNode: "+this+" heads:"+getHeadsAsString()); + LOGGER.log(Level.INFO, "Completed flow without FlowEndNode: "+this+" heads:"+getHeadsAsString()); } if (super.isComplete() && done != Boolean.TRUE) { - LOGGER.log(Level.WARNING, "Flow has FlowEndNode, but is not marked as done, fixing this for"+this); + LOGGER.log(Level.FINE, "Flow has FlowEndNode, but is not marked as done, fixing this for"+this); done = true; saveOwner(); } From d824047e87cd7dd2898e51536a962b9b9a39de4e Mon Sep 17 00:00:00 2001 From: Sam Van Oort Date: Wed, 2 May 2018 17:15:05 -0400 Subject: [PATCH 34/34] Pick up released workflow-job version --- pom.xml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index caf00f7d9..3fc2e6615 100644 --- a/pom.xml +++ b/pom.xml @@ -141,8 +141,7 @@ org.jenkins-ci.plugins.workflow workflow-job - - 2.21-20180501.222458-6 + 2.21 test