Skip to content
Permalink
Browse files

Merge pull request #49 from jglick/heartbeat

[JENKINS-47791] Remove ProcessLiveness and PID tracking in favor of a simple timestamp check on the log file
  • Loading branch information
jglick committed Nov 15, 2017
2 parents edef683 + d35e639 commit a818ef883ff5200a64ca33f4d700d1ecf17b3211
22 pom.xml
@@ -3,7 +3,7 @@
<parent>
<groupId>org.jenkins-ci.plugins</groupId>
<artifactId>plugin</artifactId>
<version>2.26</version>
<version>3.0</version>
<relativePath />
</parent>
<artifactId>durable-task</artifactId>
@@ -20,7 +20,7 @@
</licenses>

<properties>
<jenkins.version>1.642.3</jenkins.version>
<jenkins.version>2.7.3</jenkins.version>
<findbugs.failOnError>false</findbugs.failOnError>
</properties>

@@ -41,6 +41,20 @@
<connection>scm:git:git://github.com/jenkinsci/${project.artifactId}-plugin.git</connection>
<developerConnection>scm:git:git@github.com:jenkinsci/${project.artifactId}-plugin.git</developerConnection>
<url>http://github.com/jenkinsci/${project.artifactId}-plugin</url>
<tag>HEAD</tag>
</scm>
<tag>HEAD</tag>
</scm>
<dependencies>
<dependency>
<groupId>org.jenkins-ci.test</groupId>
<artifactId>docker-fixtures</artifactId>
<version>1.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jenkins-ci.plugins</groupId>
<artifactId>ssh-slaves</artifactId>
<version>1.22</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
@@ -39,6 +39,9 @@
import java.util.List;
import java.util.Map;
import java.util.WeakHashMap;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.annotation.Nonnull;
import jenkins.model.Jenkins;
import jenkins.security.MasterToSlaveCallable;
@@ -49,12 +52,30 @@
*/
public final class BourneShellScript extends FileMonitoringTask {

private static final Logger LOGGER = Logger.getLogger(BourneShellScript.class.getName());

private static enum OsType {DARWIN, UNIX, WINDOWS}

/** Number of times we will show launch diagnostics in a newly encountered workspace before going mute to save resources. */
private static /* not final */ int NOVEL_WORKSPACE_DIAGNOSTICS_COUNT = Integer.getInteger(BourneShellScript.class.getName() + ".NOVEL_WORKSPACE_DIAGNOSTICS_COUNT", 10);
/** Number of seconds we will wait for a controller script to be launched before assuming the launch failed. */
private static /* not final */ int LAUNCH_FAILURE_TIMEOUT = Integer.getInteger(BourneShellScript.class.getName() + ".LAUNCH_FAILURE_TIMEOUT", 15);
@SuppressWarnings("FieldMayBeFinal")
// TODO use SystemProperties if and when unrestricted
private static int NOVEL_WORKSPACE_DIAGNOSTICS_COUNT = Integer.getInteger(BourneShellScript.class.getName() + ".NOVEL_WORKSPACE_DIAGNOSTICS_COUNT", 10);

/**
* Seconds between heartbeat checks, where we check to see if
* {@code jenkins-log.txt} is still being modified.
*/
@SuppressWarnings("FieldMayBeFinal")
private static int HEARTBEAT_CHECK_INTERVAL = Integer.getInteger(BourneShellScript.class.getName() + ".HEARTBEAT_CHECK_INTERVAL", 15);

/**
* Minimum timestamp difference on {@code jenkins-log.txt} that is
* considered an actual modification. Theoretically could be zero (if
* {@code <} became {@code <=}, else infinitesimal positive) but on some
* platforms file timestamps are not that precise.
*/
@SuppressWarnings("FieldMayBeFinal")
private static int HEARTBEAT_MINIMUM_DELTA = Integer.getInteger(BourneShellScript.class.getName() + ".HEARTBEAT_MINIMUM_DELTA", 2);

private final @Nonnull String script;
private boolean capturingOutput;
@@ -101,7 +122,7 @@ public String getScript() {
shf.chmod(0755);

scriptPath = shf.getRemote();
List<String> args = new ArrayList<String>();
List<String> args = new ArrayList<>();

OsType os = ws.act(new getOsType());

@@ -115,27 +136,35 @@ public String getScript() {
envVars.put(cookieVariable, "please-do-not-kill-me");
// The temporary variable is to ensure JENKINS_SERVER_COOKIE=durable-… does not appear even in argv[], lest it be confused with the environment.
String cmd;
FilePath logFile = c.getLogFile(ws);
FilePath resultFile = c.getResultFile(ws);
FilePath controlDir = c.controlDir(ws);
if (capturingOutput) {
cmd = String.format("echo $$ > '%s'; jsc=%s; %s=$jsc '%s' > '%s' 2> '%s'; echo $? > '%s'",
c.pidFile(ws),
cmd = String.format("{ while [ -d '%s' -a \\! -f '%s' ]; do touch '%s'; sleep 3; done } & jsc=%s; %s=$jsc '%s' > '%s' 2> '%s'; echo $? > '%s'; wait",
controlDir,
resultFile,
logFile,
cookieValue,
cookieVariable,
scriptPath,
c.getOutputFile(ws),
c.getLogFile(ws),
c.getResultFile(ws));
logFile,
resultFile);
} else {
cmd = String.format("echo $$ > '%s'; jsc=%s; %s=$jsc '%s' > '%s' 2>&1; echo $? > '%s'",
c.pidFile(ws),
cmd = String.format("{ while [ -d '%s' -a \\! -f '%s' ]; do touch '%s'; sleep 3; done } & jsc=%s; %s=$jsc '%s' > '%s' 2>&1; echo $? > '%s'; wait",
controlDir,
resultFile,
logFile,
cookieValue,
cookieVariable,
scriptPath,
c.getLogFile(ws),
c.getResultFile(ws));
logFile,
resultFile);
}
cmd = cmd.replace("$", "$$"); // escape against EnvVars jobEnv in LocalLauncher.launch

args.addAll(Arrays.asList("sh", "-c", cmd));
LOGGER.log(Level.FINE, "launching {0}", args);
Launcher.ProcStarter ps = launcher.launch().cmds(args).envs(escape(envVars)).pwd(ws).quiet(true);
listener.getLogger().println("[" + ws.getRemote().replaceFirst("^.+/", "") + "] Running shell script"); // -x will give details
boolean novel;
@@ -160,8 +189,10 @@ public String getScript() {

/*package*/ static final class ShellController extends FileMonitoringController {

private int pid;
private final long startTime = System.currentTimeMillis();
/** Last time we checked the timestamp, in nanoseconds on the master. */
private transient long lastCheck;
/** Last-observed modification time of {@link getLogFile} on remote computer, in milliseconds. */
private transient long checkedTimestamp;

private ShellController(FilePath ws) throws IOException, InterruptedException {
super(ws);
@@ -171,47 +202,50 @@ public FilePath getScriptFile(FilePath ws) throws IOException, InterruptedExcept
return controlDir(ws).child("script.sh");
}

FilePath pidFile(FilePath ws) throws IOException, InterruptedException {
/** Only here for compatibility. */
private FilePath pidFile(FilePath ws) throws IOException, InterruptedException {
return controlDir(ws).child("pid");
}

private synchronized int pid(FilePath ws) throws IOException, InterruptedException {
if (pid == 0) {
FilePath pidFile = pidFile(ws);
if (pidFile.exists()) {
try {
pid = Integer.parseInt(pidFile.readToString().trim());
} catch (NumberFormatException x) {
throw new IOException("corrupted content in " + pidFile + ": " + x, x);
}
}
}
return pid;
}

@Override public Integer exitStatus(FilePath workspace, Launcher launcher) throws IOException, InterruptedException {
Integer status = super.exitStatus(workspace, launcher);
if (status != null) {
LOGGER.log(Level.FINE, "found exit code {0} in {1}", new Object[] {status, controlDir});
return status;
}
int _pid = pid(workspace);
if (_pid > 0 && !ProcessLiveness.isAlive(workspace.getChannel(), _pid, launcher)) {
// it looks like the process has disappeared. one last check to make sure it's not a result of a race condition,
// then if we still don't have the exit code, use fake exit code to distinguish from 0 (success) and 1+ (observed failure)
// TODO would be better to have exitStatus accept a TaskListener so we could print an informative message
status = super.exitStatus(workspace, launcher);
if (status == null) {
status = -1;
long now = System.nanoTime();
if (lastCheck == 0) {
LOGGER.log(Level.FINE, "starting check in {0}", controlDir);
lastCheck = now;
} else if (now > lastCheck + TimeUnit.SECONDS.toNanos(HEARTBEAT_CHECK_INTERVAL)) {
lastCheck = now;
long currentTimestamp = getLogFile(workspace).lastModified();
if (currentTimestamp == 0) {
LOGGER.log(Level.FINE, "apparently never started in {0}", controlDir);
return recordExitStatus(workspace, -2);
} else if (checkedTimestamp > 0) {
if (currentTimestamp < checkedTimestamp) {
LOGGER.log(Level.WARNING, "apparent clock skew in {0}", controlDir);
} else if (currentTimestamp < checkedTimestamp + TimeUnit.SECONDS.toMillis(HEARTBEAT_MINIMUM_DELTA)) {
FilePath pidFile = pidFile(workspace);
if (pidFile.exists()) {
LOGGER.log(Level.FINE, "still have {0} so heartbeat checks unreliable; process may or may not be alive", pidFile);
} else {
LOGGER.log(Level.FINE, "heartbeat touches apparently not running in {0}", controlDir);
return recordExitStatus(workspace, -1);
}
}
} else {
LOGGER.log(Level.FINE, "seeing recent log file modifications in {0}", controlDir);
}
return status;
} else if (_pid == 0 && /* compatibility */ startTime > 0 && System.currentTimeMillis() - startTime > 1000 * LAUNCH_FAILURE_TIMEOUT) {
return -2; // apparently never started
checkedTimestamp = currentTimestamp;
}
return null;
}

@Override public String getDiagnostics(FilePath workspace, Launcher launcher) throws IOException, InterruptedException {
return super.getDiagnostics(workspace, launcher) + " (pid: " + pid + ")";
private int recordExitStatus(FilePath workspace, int code) throws IOException, InterruptedException {
getResultFile(workspace).write(Integer.toString(code), null);
return code;
}

private static final long serialVersionUID = 1L;
@@ -235,5 +269,6 @@ private synchronized int pid(FilePath ws) throws IOException, InterruptedExcepti
return OsType.UNIX; // Default Value
}
}
private static final long serialVersionUID = 1L;
}
}
@@ -96,7 +96,7 @@ protected FileMonitoringController doLaunch(FilePath workspace, Launcher launche
protected static class FileMonitoringController extends Controller {

/** Absolute path of {@link #controlDir(FilePath)}. */
private String controlDir;
String controlDir;

/**
* @deprecated used only in pre-1.8

This file was deleted.

0 comments on commit a818ef8

Please sign in to comment.
You can’t perform that action at this time.