From 32ece82d9f4f0db1a00132c15ca6dd394806a4f1 Mon Sep 17 00:00:00 2001 From: Peter Darton Date: Tue, 6 Mar 2018 18:14:15 +0000 Subject: [PATCH] DockerCloud removes containers it created if provisioning failed. If the docker container gets successfully created, but Jenkins then fails to take on the new slave, we used to forget about it and just leak containers. We now remove the container if we failed to complete the provisioning process after we'd created it, and log the termination progress. --- .../jenkins/plugins/docker/DockerCloud.java | 6 +- .../plugins/docker/DockerTemplate.java | 52 +++++--- .../jenkins/docker/DockerTransientNode.java | 117 ++++++++++++++---- 3 files changed, 132 insertions(+), 43 deletions(-) diff --git a/src/main/java/com/nirima/jenkins/plugins/docker/DockerCloud.java b/src/main/java/com/nirima/jenkins/plugins/docker/DockerCloud.java index 4bfb95152..25d473b09 100644 --- a/src/main/java/com/nirima/jenkins/plugins/docker/DockerCloud.java +++ b/src/main/java/com/nirima/jenkins/plugins/docker/DockerCloud.java @@ -328,10 +328,11 @@ public synchronized Collection provision(final Labe final Runnable taskToCreateNewSlave = new Runnable() { @Override public void run() { + DockerTransientNode slave = null; try { // TODO where can we log provisioning progress ? final DockerAPI api = DockerCloud.this.getDockerApi(); - final DockerTransientNode slave = t.provisionNode(api, TaskListener.NULL); + slave = t.provisionNode(api, TaskListener.NULL); slave.setCloudId(DockerCloud.this.name); plannedNode.complete(slave); @@ -342,6 +343,9 @@ public void run() { LOGGER.error("Error in provisioning; template='{}' for cloud='{}'", t, getDisplayName(), ex); plannedNode.completeExceptionally(ex); + if (slave != null) { + slave.terminate(LOGGER); + } throw Throwables.propagate(ex); } finally { decrementContainersInProgress(t); diff --git a/src/main/java/com/nirima/jenkins/plugins/docker/DockerTemplate.java b/src/main/java/com/nirima/jenkins/plugins/docker/DockerTemplate.java index 3d4f2c70f..275e4e591 100644 --- a/src/main/java/com/nirima/jenkins/plugins/docker/DockerTemplate.java +++ b/src/main/java/com/nirima/jenkins/plugins/docker/DockerTemplate.java @@ -4,7 +4,6 @@ import com.github.dockerjava.api.command.CreateContainerCmd; import com.github.dockerjava.api.command.PullImageCmd; import com.github.dockerjava.api.exception.DockerClientException; -import com.github.dockerjava.api.exception.DockerException; import com.github.dockerjava.api.exception.NotFoundException; import com.github.dockerjava.api.model.PortBinding; import com.github.dockerjava.api.model.PullResponseItem; @@ -255,9 +254,8 @@ public String getRemoteFs() { public String getInstanceCapStr() { if (instanceCap == Integer.MAX_VALUE) { return ""; - } else { - return String.valueOf(instanceCap); } + return String.valueOf(instanceCap); } public int getInstanceCap() { @@ -420,7 +418,7 @@ public DockerTransientNode provisionNode(DockerAPI api, TaskListener listener) t final DockerClient client = api.getClient(); - CreateContainerCmd cmd = client.createContainerCmd(getImage()); + final CreateContainerCmd cmd = client.createContainerCmd(getImage()); fillContainerConfig(cmd); // Unique ID we use both as Node identifier and container Name @@ -430,29 +428,43 @@ public DockerTransientNode provisionNode(DockerAPI api, TaskListener listener) t connector.beforeContainerCreated(api, remoteFs, cmd); - LOGGER.info("Trying to run container {} from image: {}", uid, getImage()); - String containerId = cmd.exec().getId(); + LOGGER.info("Trying to run container for node {} from image: {}", uid, getImage()); + boolean finallyRemoveTheContainer = true; + final String containerId = cmd.exec().getId(); + // if we get this far, we have created the container so, + // if we fail to return the node, we need to ensure it's cleaned up. + LOGGER.info("Started container ID {} for node {} from image: {}", containerId, uid, getImage()); try { connector.beforeContainerStarted(api, remoteFs, containerId); client.startContainerCmd(containerId).exec(); connector.afterContainerStarted(api, remoteFs, containerId); - } catch (DockerException e) { + + final ComputerLauncher launcher = connector.createLauncher(api, containerId, remoteFs, listener); + + final DockerTransientNode node = new DockerTransientNode(uid, containerId, remoteFs, launcher); + node.setNodeDescription("Docker Agent [" + getImage() + " on "+ api.getDockerHost().getUri() + " ID " + containerId + "]"); + node.setMode(mode); + node.setLabelString(labelString); + node.setRetentionStrategy(retentionStrategy); + node.setNodeProperties(nodeProperties); + node.setRemoveVolumes(removeVolumes); + node.setDockerAPI(api); + finallyRemoveTheContainer = false; + return node; + } finally { // if something went wrong, cleanup aborted container - client.removeContainerCmd(containerId).withForce(true).exec(); + // while ensuring that the original exception escapes. + if ( finallyRemoveTheContainer ) { + try { + client.removeContainerCmd(containerId).withForce(true).exec(); + } catch (NotFoundException ex) { + LOGGER.info("Unable to remove container '" + containerId + "' as it had already gone."); + } catch (Throwable ex) { + LOGGER.error("Unable to remove container '" + containerId + "' due to exception:", ex); + } + } } - - final ComputerLauncher launcher = connector.createLauncher(api, containerId, remoteFs, listener); - - DockerTransientNode node = new DockerTransientNode(uid, containerId, remoteFs, launcher); - node.setNodeDescription("Docker Agent [" + getImage() + " on "+ api.getDockerHost().getUri() + "]"); - node.setMode(mode); - node.setLabelString(labelString); - node.setRetentionStrategy(retentionStrategy); - node.setNodeProperties(nodeProperties); - node.setRemoveVolumes(removeVolumes); - node.setDockerAPI(api); - return node; } diff --git a/src/main/java/io/jenkins/docker/DockerTransientNode.java b/src/main/java/io/jenkins/docker/DockerTransientNode.java index 81ddefb88..5ac693e77 100644 --- a/src/main/java/io/jenkins/docker/DockerTransientNode.java +++ b/src/main/java/io/jenkins/docker/DockerTransientNode.java @@ -13,6 +13,10 @@ import hudson.slaves.ComputerLauncher; import io.jenkins.docker.client.DockerAPI; import jenkins.model.Jenkins; +import org.kohsuke.accmod.Restricted; +import org.kohsuke.accmod.restrictions.NoExternalUse; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nonnull; import java.io.IOException; @@ -24,13 +28,14 @@ * @author Nicolas De Loof */ public class DockerTransientNode extends Slave { + private static final Logger LOGGER = LoggerFactory.getLogger(DockerTransientNode.class.getName()); //Keeping real containerId information, but using containerName as containerId private final String containerId; private final String containerName; - private DockerAPI dockerAPI; + private transient DockerAPI dockerAPI; private boolean removeVolumes; @@ -38,7 +43,6 @@ public class DockerTransientNode extends Slave { private AtomicBoolean acceptingTasks = new AtomicBoolean(true); - public DockerTransientNode(@Nonnull String uid, String containerId, String workdir, ComputerLauncher launcher) throws Descriptor.FormException, IOException { super(nodeName(uid), workdir, launcher); this.containerName = uid; @@ -73,10 +77,18 @@ public void setDockerAPI(DockerAPI dockerAPI) { this.dockerAPI = dockerAPI; } + /** @return The {@link DockerAPI} for our cloud. */ public DockerAPI getDockerAPI() { + if (dockerAPI == null) { + final DockerCloud cloud = getCloud(); + if (cloud != null) { + dockerAPI = cloud.getDockerApi(); + } + } return dockerAPI; } + @Override public String getDisplayName() { if (cloudId != null) { return getNodeName() + " on " + cloudId; @@ -105,48 +117,109 @@ public DockerComputer createComputer() { return new DockerComputer(this); } - public void terminate(TaskListener listener) { + private interface ILogger { + void println(String msg); + + void error(String msg, Throwable ex); + } + + public void terminate(final TaskListener listener) { + final ILogger tl = new ILogger() { + @Override + public void println(String msg) { + listener.getLogger().println(msg); + LOGGER.info(msg); + } + + @Override + public void error(String msg, Throwable ex) { + listener.error(msg, ex); + LOGGER.error(msg, ex); + } + }; try { - toComputer().disconnect(new DockerOfflineCause()); - listener.getLogger().println("Disconnected computer"); - } catch (Exception e) { - listener.error("Can't disconnect", e); + terminate(tl); + } catch (Throwable ex) { + tl.error("Failure while terminating '" + name + "':", ex); } + } + + /** + * Tries to remove all trace of this node, logging anything that goes wrong. + *

+ * Note: This is not intended for use outside the plugin. + */ + @Restricted(NoExternalUse.class) + public void terminate(final Logger logger) { + final ILogger tl = new ILogger() { + @Override + public void println(String msg) { + logger.info(msg); + } + @Override + public void error(String msg, Throwable ex) { + logger.error(msg, ex); + } + }; + try { + terminate(tl); + } catch (Throwable ex) { + tl.error("Failure while terminating '" + name + "':", ex); + } + } + + private void terminate(ILogger logger) { + try { + final Computer computer = toComputer(); + if (computer != null) { + computer.disconnect(new DockerOfflineCause()); + logger.println("Disconnected computer for slave '" + name + "'."); + } + } catch (Exception ex) { + logger.error("Can't disconnect computer for slave '" + name + "' due to exception:", ex); + } + + final String containerId = getContainerId(); Computer.threadPoolForRemoting.submit(() -> { - final String containerId = getContainerId(); - DockerClient client = dockerAPI.getClient(); + final DockerClient client; + try { + final DockerAPI api = getDockerAPI(); + client = api.getClient(); + } catch (RuntimeException ex) { + logger.error("Unable to stop and remove container '" + containerId + "' for slave '" + name + "' due to exception:", ex); + return; + } try { client.stopContainerCmd(containerId) .withTimeout(10) .exec(); - listener.getLogger().println("Stopped container "+ containerId); + logger.println("Stopped container '"+ containerId + "' for slave '" + name + "'."); } catch(NotFoundException e) { - listener.getLogger().println("Container already removed " + containerId); + logger.println("Can't stop container '" + containerId + "' for slave '" + name + "' as it does not exist."); + return; // no point trying to remove the container if it's already gone. } catch (Exception ex) { - listener.error("Failed to stop instance " + getContainerId() + " for slave " + name + " due to exception", ex.getMessage()); - listener.error("Causing exception for failure on stopping the instance was", ex); + logger.error("Failed to stop container '" + containerId + "' for slave '" + name + "' due to exception:", ex); } try { client.removeContainerCmd(containerId) .withRemoveVolumes(removeVolumes) .exec(); - - listener.getLogger().println("Removed container " + containerId); + logger.println("Removed container '" + containerId + "' for slave '" + name + "'."); } catch (NotFoundException e) { - listener.getLogger().println("Container already gone."); + logger.println("Container '" + containerId + "' already gone for slave '" + name + "'."); } catch (Exception ex) { - listener.error("Failed to remove instance " + getContainerId() + " for slave " + name + " due to exception: " + ex.getMessage()); - listener.error("Causing exception for failre on removing instance was", ex); + logger.error("Failed to remove container '" + containerId + "' for slave '" + name + "' due to exception:", ex); } }); try { Jenkins.getInstance().removeNode(this); - } catch (IOException e) { - listener.error("Failed to remove Docker Node", e); + logger.println("Removed Node for slave '" + name + "'."); + } catch (IOException ex) { + logger.error("Failed to remove Node for slave '" + name + "' due to exception:", ex); } } @@ -158,8 +231,8 @@ public DockerCloud getCloud() { throw new RuntimeException("Failed to retrieve Cloud " + cloudId); } - if (cloud.getClass() != DockerCloud.class) { - throw new RuntimeException(cloudId + " is not DockerCloud"); + if (!(cloud instanceof DockerCloud)) { + throw new RuntimeException(cloudId + " is not a DockerCloud, it's a " + cloud.getClass().toString()); } return (DockerCloud) cloud;