Skip to content
Permalink
Browse files

Merge pull request #2 from jenkinsci/jenkins-28690-related

Need to hold a lock when disconnecting or you can trigger JENKINS-28690 style deadlocks
  • Loading branch information...
imeredith committed Jul 13, 2015
2 parents e265cac + 3346565 commit cf798b87dc339c91da4b5fb26ceb4ab1bcae4259
@@ -45,12 +45,26 @@
private final MansionSlave slave;
private long creationTime = System.currentTimeMillis();
private long onlineTime = 0;
private boolean disconnectInProgress;

MansionComputer(MansionSlave slave) {
super(slave);
this.slave = slave;
}

public synchronized boolean isDisconnectInProgress() {
return disconnectInProgress;
}

/*package*/ synchronized void setDisconnectInProgress(boolean disconnectInProgress) {
this.disconnectInProgress = disconnectInProgress;
}

@Override
public boolean isAcceptingTasks() {
return !isDisconnectInProgress() && super.isAcceptingTasks();
}

/**
* {@link MansionComputer} is not configurable.
*
@@ -27,11 +27,15 @@
import hudson.model.Computer;
import hudson.model.Executor;
import hudson.model.Node;
import hudson.model.Queue;
import hudson.slaves.CloudSlaveRetentionStrategy;
import hudson.slaves.OfflineCause;
import hudson.util.TimeUnit2;
import jenkins.model.Jenkins;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -40,6 +44,8 @@

public class MansionRetentionStrategy <T extends MansionComputer> extends CloudSlaveRetentionStrategy<T> {

private transient volatile boolean disconnectInProgress;

@Override
public long check(T c) {
long nextCheck = super.check(c);
@@ -72,32 +78,68 @@ private boolean shouldHaveConnectedByNow(T c) {
}

@Override
protected void kill(Node n) throws IOException {
MansionComputer computer = (MansionComputer) n.toComputer();
// attempt to heuristically detect a race condition
// where an executor accepted a task after we checked for
// idleness,
// but before we marked it as unavailable for tasks
// See JENKINS-23676
LOGGER.log(Level.FINE, "Taking node offline since it seems to be idle" + n.getNodeName());
protected void kill(final Node n) throws IOException {
final MansionComputer computer = (MansionComputer) n.toComputer();

final String nodeName = n.getNodeName();
LOGGER.log(Level.FINE, "Taking node {0} offline since it seems to be idle", n.getNodeName());
// we need to have a private block as other threads could try to turn on accepting tasks
computer.setDisconnectInProgress(true);
// set accepting tasks so that others can see this flag if expecting it
computer.setAcceptingTasks(false);
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
LOGGER.log(Level.FINE, "Interrupted while sleeping before removing node " + n.getNodeName());
return;
}
if (!computer.isIdle() && computer.isOnline()) {
LOGGER.log(FINE, computer.getName() + " is no longer idle, aborting termination.");
// we lost the race -- mark it as back online
computer.setAcceptingTasks(true);
return;
}
for (Executor e : computer.getExecutors()) {
e.interrupt();
synchronized (this) {
if (disconnectInProgress) {
// if we already have a background thread running, then we can return immediately
return;
}
disconnectInProgress = true;
}
LOGGER.log(Level.FINE, "Finally removing node " + n.getNodeName());
super.kill(n);
Computer.threadPoolForRemoting.submit(new Runnable() {
public void run() {
// TODO once Jenkins 1.607+ we can remove the sleep as the withLock will give atomic guarantee

// attempt to heuristically detect a race condition
// where an executor accepted a task after we checked for
// idleness,
// but before we marked it as unavailable for tasks
// See JENKINS-23676
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
LOGGER.log(Level.FINE, "Interrupted while sleeping before removing node {0}", nodeName);
return;
}

queueLock.withLock(new Runnable() {
public void run() {
if (!computer.isIdle() && computer.isOnline()) {
LOGGER.log(FINE, computer.getName() + " is no longer idle, aborting termination.");
// we lost the race -- mark it as back online
computer.setAcceptingTasks(true);
computer.setDisconnectInProgress(false);
synchronized (MansionRetentionStrategy.this) {
disconnectInProgress = false;
}
return;
}
// TODO figure out why this cannot just be computer.getNode().terminate()
try {
for (Executor e : computer.getExecutors()) {
e.interrupt();
}
LOGGER.log(Level.FINE, "Finally removing node " + n.getNodeName());
MansionRetentionStrategy.super.kill(n);
} catch (IOException e) {
computer.setAcceptingTasks(true);
computer.setDisconnectInProgress(false);
synchronized (MansionRetentionStrategy.this) {
disconnectInProgress = false;
}
}
}
});
}
});
}

/**
@@ -114,4 +156,55 @@ protected long getIdleMaxTime() {


private static Logger LOGGER = Logger.getLogger(MansionRetentionStrategy.class.getName());

private static final QueueLock queueLock = newQueueLock();

private static QueueLock newQueueLock() {
try {
return new ReflectionPost592QueueLock();
} catch (NoSuchMethodException e) {
return new Pre592QueueLock();
}
}

// TODO replace all this with Queue.withLock once Jenkins 1.592+
interface QueueLock {
void withLock(Runnable runnable);
}

private static class Pre592QueueLock implements QueueLock {
@SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
public void withLock(Runnable runnable) {
final Jenkins jenkins = Jenkins.getInstance();
final Object monitor = jenkins == null ? getClass() : jenkins.getQueue();
synchronized (monitor) {
runnable.run();
}
}
}

private static class ReflectionPost592QueueLock implements QueueLock {
private final Method withLock;

private ReflectionPost592QueueLock() throws NoSuchMethodException {
this.withLock = Queue.class.getMethod("withLock", Runnable.class);
if (!Modifier.isStatic(withLock.getModifiers())) {
throw new NoSuchMethodException("Expecting withLock(Runnable) to be static");
}
if (!Modifier.isPublic(withLock.getModifiers())) {
throw new NoSuchMethodException("Expecting withLock(Runnable) to be static");
}
}

public void withLock(Runnable runnable) {
try {
withLock.invoke(null, runnable);
} catch (IllegalAccessException e) {
// fall back, but should never get here as the constructor will blow up first
new Pre592QueueLock().withLock(runnable);
} catch (InvocationTargetException e) {
// ignore
}
}
}
}

0 comments on commit cf798b8

Please sign in to comment.
You can’t perform that action at this time.