Skip to content
Permalink
Browse files
Merge pull request #1815 from varmenise/OSS-192
[FIXED JENKINS-30084] FlyWeightTasks tied to a label will not cause node provisioning and will be blocked forever.

When a flyweighttask is limited to run on a specific label (e.g. matrix project set restrict where this project can run) if there are no nodes with that label available when it enters the queue then it will immediately move to blocked.
As it is blocked the Node provisioner will not attempt to create any slaves, so the project will sit in the queue forever (or until some other project allocates a slave with the correct label).
(cherry picked from commit 5a5f9c5)
  • Loading branch information
stephenc authored and olivergondza committed Oct 27, 2015
1 parent e021165 commit f07b3b7efc252be7e84dc2f0042bf8a5a45e99b6
Show file tree
Hide file tree
Showing 3 changed files with 260 additions and 51 deletions.
@@ -876,6 +876,13 @@ public List<BuildableItem> getPendingItems() {
return new ArrayList<BuildableItem>(snapshot.pendings);
}

/**
* Gets the snapshot of all {@link BlockedItem}s.
*/
protected List<BlockedItem> getBlockedItems() {
return new ArrayList<BlockedItem>(snapshot.blockedProjects);
}

/**
* Returns the snapshot of all {@link LeftItem}s.
*
@@ -1471,43 +1478,54 @@ public void maintain() {
continue;
}

List<JobOffer> candidates = new ArrayList<JobOffer>(parked.size());
for (JobOffer j : parked.values())
if (j.canTake(p))
candidates.add(j);

MappingWorksheet ws = new MappingWorksheet(p, candidates);
Mapping m = loadBalancer.map(p.task, ws);
if (m == null) {
// if we couldn't find the executor that fits,
// just leave it in the buildables list and
// check if we can execute other projects
LOGGER.log(Level.FINER, "Failed to map {0} to executors. candidates={1} parked={2}",
new Object[]{p, candidates, parked.values()});
continue;
}
if (p.task instanceof FlyweightTask) {
Runnable r = makeFlyWeightTaskBuildable(new BuildableItem(p));
if (r != null) {
p.leave(this);
r.run();
updateSnapshot();
}
} else {

List<JobOffer> candidates = new ArrayList<JobOffer>(parked.size());
for (JobOffer j : parked.values())
if (j.canTake(p))
candidates.add(j);

MappingWorksheet ws = new MappingWorksheet(p, candidates);
Mapping m = loadBalancer.map(p.task, ws);
if (m == null) {
// if we couldn't find the executor that fits,
// just leave it in the buildables list and
// check if we can execute other projects
LOGGER.log(Level.FINER, "Failed to map {0} to executors. candidates={1} parked={2}",
new Object[]{p, candidates, parked.values()});
continue;
}

// found a matching executor. use it.
WorkUnitContext wuc = new WorkUnitContext(p);
m.execute(wuc);

// found a matching executor. use it.
WorkUnitContext wuc = new WorkUnitContext(p);
m.execute(wuc);

p.leave(this);
if (!wuc.getWorkUnits().isEmpty())
makePending(p);
else
LOGGER.log(Level.FINE, "BuildableItem {0} with empty work units!?", p);

// Ensure that identification of blocked tasks is using the live state: JENKINS-27708 & JENKINS-27871
// The creation of a snapshot itself should be relatively cheap given the expected rate of
// job execution. You probably would need 100's of jobs starting execution every iteration
// of maintain() before this could even start to become an issue and likely the calculation
// of isBuildBlocked(p) will become a bottleneck before updateSnapshot() will. Additionally
// since the snapshot itself only ever has at most one reference originating outside of the stack
// it should remain in the eden space and thus be cheap to GC.
// See https://issues.jenkins-ci.org/browse/JENKINS-27708?focusedCommentId=225819&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-225819
// or https://issues.jenkins-ci.org/browse/JENKINS-27708?focusedCommentId=225906&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-225906
// for alternative fixes of this issue.
updateSnapshot();
p.leave(this);
if (!wuc.getWorkUnits().isEmpty()) {
makePending(p);
}
else
LOGGER.log(Level.FINE, "BuildableItem {0} with empty work units!?", p);

// Ensure that identification of blocked tasks is using the live state: JENKINS-27708 & JENKINS-27871
// The creation of a snapshot itself should be relatively cheap given the expected rate of
// job execution. You probably would need 100's of jobs starting execution every iteration
// of maintain() before this could even start to become an issue and likely the calculation
// of isBuildBlocked(p) will become a bottleneck before updateSnapshot() will. Additionally
// since the snapshot itself only ever has at most one reference originating outside of the stack
// it should remain in the eden space and thus be cheap to GC.
// See https://issues.jenkins-ci.org/browse/JENKINS-27708?focusedCommentId=225819&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-225819
// or https://issues.jenkins-ci.org/browse/JENKINS-27708?focusedCommentId=225906&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-225906
// for alternative fixes of this issue.
updateSnapshot();
}
}
} finally { updateSnapshot(); } } finally {
lock.unlock();
@@ -1522,8 +1540,38 @@ public void maintain() {
private @CheckForNull Runnable makeBuildable(final BuildableItem p) {
if (p.task instanceof FlyweightTask) {
if (!isBlockedByShutdown(p.task)) {

Runnable runnable = makeFlyWeightTaskBuildable(p);

if(runnable != null){
return runnable;
}

//this is to solve JENKINS-30084: the task has to be buildable to force the provisioning of nodes.
//if the execution gets here, it means the task could not be scheduled since the node
//the task is supposed to run on is offline or not available.
//Thus, the flyweighttask enters the buildables queue and will ask Jenkins to provision a node
return new BuildableRunnable(p);
}
// if the execution gets here, it means the task is blocked by shutdown and null is returned.
return null;
} else {
// regular heavyweight task
return new BuildableRunnable(p);
}
}

/**
* This method checks if the flyweight task can be run on any of the available executors
* @param p - the flyweight task to be scheduled
* @return a Runnable if there is an executor that can take the task, null otherwise
*/
@CheckForNull
private Runnable makeFlyWeightTaskBuildable(final BuildableItem p){
//we double check if this is a flyweight task
if (p.task instanceof FlyweightTask) {
Jenkins h = Jenkins.getInstance();
Map<Node,Integer> hashSource = new HashMap<Node, Integer>(h.getNodes().size());
Map<Node, Integer> hashSource = new HashMap<Node, Integer>(h.getNodes().size());

// Even if master is configured with zero executors, we may need to run a flyweight task like MatrixProject on it.
hashSource.put(h, Math.max(h.getNumExecutors() * 100, 1));
@@ -1538,29 +1586,26 @@ public void maintain() {
Label lbl = p.getAssignedLabel();
for (Node n : hash.list(p.task.getFullDisplayName())) {
final Computer c = n.toComputer();
if (c==null || c.isOffline()) continue;
if (lbl!=null && !lbl.contains(n)) continue;
if (n.canTake(p) != null) continue;
if (c == null || c.isOffline()) {
continue;
}
if (lbl!=null && !lbl.contains(n)) {
continue;
}
if (n.canTake(p) != null) {
continue;
}
return new Runnable() {
@Override public void run() {
c.startFlyWeightTask(new WorkUnitContext(p).createWorkUnit(p.task));
makePending(p);
}
};
}
}
// if the execution get here, it means we couldn't schedule it anywhere.
return null;
} else { // regular heavyweight task
return new Runnable() {
@Override public void run() {
p.enter(Queue.this);
}
};
}
return null;
}


private static Hash<Node> NODE_HASH = new Hash<Node>() {
public String hash(Node node) {
return node.getNodeName();
@@ -2672,6 +2717,20 @@ public void run() {
}
}

private class BuildableRunnable implements Runnable {
private final BuildableItem buildableItem;

private BuildableRunnable(BuildableItem p) {
this.buildableItem = p;
}

@Override
public void run() {
//the flyweighttask enters the buildables queue and will ask Jenkins to provision a node
buildableItem.enter(Queue.this);
}
}

private static class LockedJUCCallable<V> implements java.util.concurrent.Callable<V> {
private final java.util.concurrent.Callable<V> delegate;

@@ -30,6 +30,7 @@
import com.gargoylesoftware.htmlunit.xml.XmlPage;
import hudson.Launcher;
import hudson.XmlFile;
import hudson.matrix.Axis;
import hudson.matrix.AxisList;
import hudson.matrix.LabelAxis;
import hudson.matrix.MatrixBuild;
@@ -41,7 +42,9 @@
import hudson.model.Queue.BlockedItem;
import hudson.model.Queue.Executable;
import hudson.model.Queue.WaitingItem;
import hudson.model.labels.LabelExpression;
import hudson.model.queue.AbstractQueueTask;
import hudson.model.queue.CauseOfBlockage;
import hudson.model.queue.QueueTaskFuture;
import hudson.model.queue.ScheduleResult;
import hudson.model.queue.SubTask;
@@ -50,6 +53,8 @@
import hudson.security.SparseACL;
import hudson.slaves.DumbSlave;
import hudson.slaves.DummyCloudImpl;
import hudson.slaves.NodeProperty;
import hudson.slaves.NodePropertyDescriptor;
import hudson.slaves.NodeProvisionerRule;
import hudson.tasks.BuildTrigger;
import hudson.tasks.Shell;
@@ -99,6 +104,7 @@
import org.jvnet.hudson.test.SequenceLock;
import org.jvnet.hudson.test.SleepBuilder;
import org.jvnet.hudson.test.TestBuilder;
import org.jvnet.hudson.test.TestExtension;
import org.jvnet.hudson.test.recipes.LocalData;
import org.mortbay.jetty.Server;
import org.mortbay.jetty.bio.SocketConnector;
@@ -780,4 +786,119 @@ public void run() {
"B finished at " + buildBEndTime + ", C started at " + buildC.getStartTimeInMillis(),
buildC.getStartTimeInMillis() >= buildBEndTime);
}

@Issue("JENKINS-30084")
@Test
/*
* When a flyweight task is restricted to run on a specific node, the node will be provisioned
* and the flyweight task will be executed.
*/
public void shouldRunFlyweightTaskOnProvisionedNodeWhenNodeRestricted() throws Exception {
MatrixProject matrixProject = r.createMatrixProject();
matrixProject.setAxes(new AxisList(
new Axis("axis", "a", "b")
));
Label label = LabelExpression.get("aws-linux-dummy");
DummyCloudImpl dummyCloud = new DummyCloudImpl(r, 0);
dummyCloud.label = label;
r.jenkins.clouds.add(dummyCloud);
matrixProject.setAssignedLabel(label);
r.assertBuildStatusSuccess(matrixProject.scheduleBuild2(0));
assertEquals("aws-linux-dummy", matrixProject.getBuilds().getLastBuild().getBuiltOn().getLabelString());
}

@Test
public void shouldBeAbleToBlockFlyweightTaskAtTheLastMinute() throws Exception {
MatrixProject matrixProject = r.createMatrixProject("downstream");
matrixProject.setDisplayName("downstream");
matrixProject.setAxes(new AxisList(
new Axis("axis", "a", "b")
));

Label label = LabelExpression.get("aws-linux-dummy");
DummyCloudImpl dummyCloud = new DummyCloudImpl(r, 0);
dummyCloud.label = label;
BlockDownstreamProjectExecution property = new BlockDownstreamProjectExecution();
dummyCloud.getNodeProperties().add(property);
r.jenkins.clouds.add(dummyCloud);
matrixProject.setAssignedLabel(label);

FreeStyleProject upstreamProject = r.createFreeStyleProject("upstream");
upstreamProject.getBuildersList().add(new SleepBuilder(10000));
upstreamProject.setDisplayName("upstream");

//let's assume the flyweighttask has an upstream project and that must be blocked
// when the upstream project is running
matrixProject.addTrigger(new ReverseBuildTrigger("upstream", Result.SUCCESS));
matrixProject.setBlockBuildWhenUpstreamBuilding(true);

//we schedule the project but we pretend no executors are available thus
//the flyweight task is in the buildable queue without being executed
QueueTaskFuture downstream = matrixProject.scheduleBuild2(0);
if (downstream == null) {
throw new Exception("the flyweight task could not be scheduled, thus the test will be interrupted");
}
//let s wait for the Queue instance to be updated
while (Queue.getInstance().getBuildableItems().size() != 1) {
Thread.sleep(10);
}
//in this state the build is not blocked, it's just waiting for an available executor
assertFalse(Queue.getInstance().getItems()[0].isBlocked());

//we start the upstream project that should block the downstream one
QueueTaskFuture upstream = upstreamProject.scheduleBuild2(0);
if (upstream == null) {
throw new Exception("the upstream task could not be scheduled, thus the test will be interrupted");
}
//let s wait for the Upstream to enter the buildable Queue
boolean enteredTheQueue = false;
while (!enteredTheQueue) {
for (Queue.BuildableItem item : Queue.getInstance().getBuildableItems()) {
if (item.task.getDisplayName() != null && item.task.getDisplayName().equals(upstreamProject.getDisplayName())) {
enteredTheQueue = true;
}
}
}
//let's wait for the upstream project to actually start so that we're sure the Queue has been updated
//when the upstream starts the downstream has already left the buildable queue and the queue is empty
while (!Queue.getInstance().getBuildableItems().isEmpty()) {
Thread.sleep(10);
}
assertTrue(Queue.getInstance().getItems()[0].isBlocked());
assertTrue(Queue.getInstance().getBlockedItems().get(0).task.getDisplayName().equals(matrixProject.displayName));

//once the upstream is completed, the downstream can join the buildable queue again.
r.assertBuildStatusSuccess(upstream);
while (Queue.getInstance().getBuildableItems().isEmpty()) {
Thread.sleep(10);
}
assertFalse(Queue.getInstance().getItems()[0].isBlocked());
assertTrue(Queue.getInstance().getBlockedItems().isEmpty());
assertTrue(Queue.getInstance().getBuildableItems().get(0).task.getDisplayName().equals(matrixProject.displayName));
}

//let's make sure that the downstram project is not started before the upstream --> we want to simulate
// the case: buildable-->blocked-->buildable
public static class BlockDownstreamProjectExecution extends NodeProperty<Slave> {
@Override
public CauseOfBlockage canTake(Queue.BuildableItem item) {
if (item.task.getName().equals("downstream")) {
return new CauseOfBlockage() {
@Override
public String getShortDescription() {
return "slave not provisioned";
}
};
}
return null;
}

@TestExtension("shouldBeAbleToBlockFlyWeightTaskOnLastMinute")
public static class DescriptorImpl extends NodePropertyDescriptor {
@Override
public String getDisplayName() {
return "Some Property";
}
}
}
}

0 comments on commit f07b3b7

Please sign in to comment.