Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[JENKINS-55287] Improve error messages for nonresumable Pipelines #363

Closed
wants to merge 10 commits into from
Expand Up @@ -712,17 +712,26 @@ public void onLoad(FlowExecutionOwner owner) throws IOException {
this.owner = owner;

try {
initializeStorage(); // Throws exception and bombs out if we can't load FlowNodes
} catch (Exception ex) {
try {
initializeStorage(); // Throws exception and bombs out if we can't load FlowNodes
} catch (Exception ex) {
LOGGER.log(Level.WARNING, "Error initializing storage and loading nodes, will try to create placeholders for: "+this, ex);
if (!canResume()) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, we could check this even before we attempt to initialize storage and just bail out right away. I don't know if that is too pessimistic, maybe there is a case where the Pipeline is not resumable but flow nodes are all up to date and so the optimistic behavior here is useful because you will still be able to see the existing flow graph in those cases?

// This case is expected for PERFORMANCE_OPTIMIZED Pipelines after a Jenkins crash (JENKINS-55287). The specific issue is
// usually that only the FlowStartNode has been persisted and so we cannot find the current head node in storage.
String message = "Unable to resume " + owner.getExecutable() + " because it was not saved before Jenkins shut down. Did Jenkins crash?";
LOGGER.log(Level.WARNING, message, ex);
ex = new AbortException(message); // Change the exception displayed in the build log to one that is more useful for users.
} else {
LOGGER.log(Level.WARNING, "Unable to resume " + owner.getExecutable() + " because its flow nodes could not be loaded", ex);
}
createPlaceholderNodes(ex);
return;
} catch (Exception ex2) {
Copy link
Member Author

@dwnusbaum dwnusbaum Jun 18, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is all mostly refactoring. The old code had this structure:

try {
  try {
    // code that can throw
  } catch (first) {
    // code that can throw
    return
  }
} catch (second) {
   throw second // first exception, if any, is totally lost
}

Now it has this structure:

try {
  // code that can throw
} catch (first) {
  try {
    // code that can throw
  } catch (second) {
    second.addSuppressed(first)
    throw second
  }
  return
}

ex2.addSuppressed(ex);
done = true;
programPromise = Futures.immediateFailedFuture(ex2);
throw new IOException("Failed to even create placeholder nodes for execution", ex2);
}
} catch (Exception ex) {
done = true;
programPromise = Futures.immediateFailedFuture(ex);
throw new IOException("Failed to even create placeholder nodes for execution", ex);
return;
}

try {
Expand All @@ -736,13 +745,19 @@ public void onLoad(FlowExecutionOwner owner) throws IOException {
saveOwner();
}
} else { // See if we can/should resume build
// Always set up the shell in case the build is not resumable and loadProgramFailed is called without loadProgramAsync being called first.
scriptClass = parseScript().getClass();
if (canResume()) {
// Before we resume, we need to unset persistedClean (and persist the change) in case Jenkins restarts again.
persistedClean = null;
saveOwner();
loadProgramAsync(getProgramDataFile());
} else {
// TODO if possible, consider trying to close out unterminated blocks to keep existing graph history
// That way we can visualize the graph in some error cases.
LOGGER.log(Level.WARNING, "Pipeline state not properly persisted, cannot resume "+owner.getUrl());
throw new IOException("Cannot resume build -- was not cleanly saved when Jenkins shut down.");
String message = "Unable to resume " + owner.getExecutable() + " because it was not saved before Jenkins shut down. Did Jenkins crash?";
LOGGER.log(Level.WARNING, message);
throw new IOException(message);
}
}
} catch (Exception e) { // Broad catch ensures that failure to load do NOT nuke the master
Expand All @@ -767,8 +782,6 @@ public void loadProgramAsync(File programDataFile) {
programPromise = result;

try {
scriptClass = parseScript().getClass();

final RiverReader r = new RiverReader(programDataFile, scriptClass.getClassLoader(), owner);
Futures.addCallback(
r.restorePickles(pickleFutures = new ArrayList<>()),
Expand Down Expand Up @@ -1470,6 +1483,10 @@ void notifyListeners(List<FlowNode> nodes, boolean synchronous) {
} finally {
if (synchronous) {
bc.abort(); // hack to skip save—we are holding a lock
} else if (!getDurabilityHint().isPersistWithEveryStep() && !isComplete()
&& nodes.stream().noneMatch(node -> node instanceof FlowStartNode || node instanceof FlowEndNode)) {
// Do not save in PERFORMANCE_OPTIMIZED mode unless this is the end of the build.
bc.abort();
} else {
try {
bc.commit();
Expand Down Expand Up @@ -1526,10 +1543,6 @@ public boolean isPaused() {
return false;
}

private void setPersistedClean(boolean persistedClean) { // Workaround for some issues with anonymous classes.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unused

this.persistedClean = persistedClean;
}

/**
* Pause or unpause the execution.
*
Expand All @@ -1550,6 +1563,10 @@ public void pause(final boolean v) throws IOException {
checkAndAbortNonresumableBuild(); // TODO Verify if we can rely on just killing paused builds at shutdown via checkAndAbortNonresumableBuild()
checkpoint(false);
} else {
// Pausing the build sets persistedClean to true so the build can resume, so if we unpause the build
// we need to unset persistedClean again.
persistedClean = null;
saveOwner();
Comment on lines +1566 to +1569
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add a test case for this. The Pipeline would need to be paused, then unpaused, and then we do a hard shutdown and make sure we get the error message that the Pipeline can't be resumed rather than a raw exception.

g.unpause();
}
try {
Expand Down Expand Up @@ -1954,7 +1971,6 @@ void saveOwner() {
try {
if (this.owner != null && this.owner.getExecutable() instanceof Saveable) { // Null-check covers some anomalous cases we've seen
Saveable saveable = (Saveable)(this.owner.getExecutable());
persistedClean = true;
if (storage != null && storage.delegate != null) {
// Defensively flush FlowNodes to storage
try {
Expand Down
Expand Up @@ -14,6 +14,7 @@
import org.jenkinsci.plugins.workflow.job.properties.DurabilityHintJobProperty;
import org.jenkinsci.plugins.workflow.support.steps.input.InputAction;
import org.jenkinsci.plugins.workflow.support.steps.input.InputStepExecution;
import org.jenkinsci.plugins.workflow.test.steps.SemaphoreStep;
import org.junit.Assert;
import org.junit.ClassRule;
import org.junit.Rule;
Expand All @@ -27,6 +28,7 @@
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

import static org.junit.Assert.assertEquals;
/**
* Verifies we can cope with all the bizarre quirks that occur when persistence fails or something unexpected happens.
*/
Expand Down Expand Up @@ -261,13 +263,14 @@ public void inProgressMaxPerfCleanShutdown() throws Exception {
});
}

@Issue("JENKINS-55287")
@Test
public void inProgressMaxPerfDirtyShutdown() throws Exception {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the exact situation reported in JENKINS-55287 as far as I can tell.

final int[] build = new int[1];
final String[] finalNodeId = new String[1];
story.thenWithHardShutdown( j -> {
runBasicPauseOnInput(j, DEFAULT_JOBNAME, build, FlowDurabilityHint.PERFORMANCE_OPTIMIZED);
// SHOULD still save at end via persist-at-shutdown hooks
// Will not be saved since the persist-at-shutdown hooks will occur after the snapshot for thenWithHardShutdown is taken.
});
story.then( j->{
WorkflowJob r = j.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class);
Expand All @@ -276,6 +279,7 @@ public void inProgressMaxPerfDirtyShutdown() throws Exception {
j.waitForCompletion(run);
assertCompletedCleanly(run);
Assert.assertEquals(Result.FAILURE, run.getResult());
j.assertLogContains("was not saved before Jenkins shut down", run);
finalNodeId[0] = run.getExecution().getCurrentHeads().get(0).getId();
});
story.then(j-> {
Expand Down Expand Up @@ -359,4 +363,42 @@ public void inProgressButStartBlocksLost() throws Exception {
Assert.assertEquals("4", b.getExecution().getCurrentHeads().get(0).getId());
});
}

/**
* Makes sure that CpsFlowExecution.persistedClean is nulled out after a successful resumption in case of future resumptions.
*/
@Test
public void inProgressMaxPerfCleanShutdownThenDirtyShutdown() {
story.then(r -> {
WorkflowJob p = r.jenkins.createProject(WorkflowJob.class, DEFAULT_JOBNAME);
p.setDefinition(new CpsFlowDefinition(
"echo 'initial execution'\n" +
"semaphore 'clean-shutdown'\n" +
"echo 'after clean shutdown'\n" +
"semaphore 'dirty-shutdown'\n" +
"echo 'after dirty shutdown'", true));
p.addProperty(new DurabilityHintJobProperty(FlowDurabilityHint.PERFORMANCE_OPTIMIZED));
WorkflowRun b = p.scheduleBuild2(0).waitForStart();
SemaphoreStep.waitForStart("clean-shutdown/1", b);
r.assertLogContains("initial execution", b);
});
story.thenWithHardShutdown(r -> {
WorkflowJob p = r.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class);
WorkflowRun b = p.getLastBuild();
assertCleanInProgress(b);
SemaphoreStep.success("clean-shutdown/1", null);
SemaphoreStep.waitForStart("dirty-shutdown/1", b);
r.assertLogContains("after clean shutdown", b);
// Will not be able to resume after this since the persist-at-shutdown hooks will occur after the snapshot for thenWithHardShutdown is taken.
});
story.then(r -> {
WorkflowJob p = r.jenkins.getItemByFullName(DEFAULT_JOBNAME, WorkflowJob.class);
WorkflowRun b = p.getLastBuild();
r.waitForCompletion(b);
assertCompletedCleanly(b);
assertEquals(Result.FAILURE, b.getResult());
r.assertLogContains("was not saved before Jenkins shut down", b);
r.assertLogNotContains("after dirty shutdown", b);
});
}
}