Skip to content

Commit

Permalink
DockerCloud removes containers it created if provisioning failed.
Browse files Browse the repository at this point in the history
If the docker container gets successfully created, but Jenkins then
fails to take on the new slave, we used to forget about it and just leak
containers.
We now remove the container if we failed to complete the provisioning
process after we'd created it, and log the termination progress.
  • Loading branch information
pjdarton committed Mar 6, 2018
1 parent 1411dce commit 32ece82
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 43 deletions.
Expand Up @@ -328,10 +328,11 @@ public synchronized Collection<NodeProvisioner.PlannedNode> provision(final Labe
final Runnable taskToCreateNewSlave = new Runnable() {
@Override
public void run() {
DockerTransientNode slave = null;
try {
// TODO where can we log provisioning progress ?
final DockerAPI api = DockerCloud.this.getDockerApi();
final DockerTransientNode slave = t.provisionNode(api, TaskListener.NULL);
slave = t.provisionNode(api, TaskListener.NULL);
slave.setCloudId(DockerCloud.this.name);
plannedNode.complete(slave);

Expand All @@ -342,6 +343,9 @@ public void run() {
LOGGER.error("Error in provisioning; template='{}' for cloud='{}'",
t, getDisplayName(), ex);
plannedNode.completeExceptionally(ex);
if (slave != null) {
slave.terminate(LOGGER);
}
throw Throwables.propagate(ex);
} finally {
decrementContainersInProgress(t);
Expand Down
52 changes: 32 additions & 20 deletions src/main/java/com/nirima/jenkins/plugins/docker/DockerTemplate.java
Expand Up @@ -4,7 +4,6 @@
import com.github.dockerjava.api.command.CreateContainerCmd;
import com.github.dockerjava.api.command.PullImageCmd;
import com.github.dockerjava.api.exception.DockerClientException;
import com.github.dockerjava.api.exception.DockerException;
import com.github.dockerjava.api.exception.NotFoundException;
import com.github.dockerjava.api.model.PortBinding;
import com.github.dockerjava.api.model.PullResponseItem;
Expand Down Expand Up @@ -255,9 +254,8 @@ public String getRemoteFs() {
public String getInstanceCapStr() {
if (instanceCap == Integer.MAX_VALUE) {
return "";
} else {
return String.valueOf(instanceCap);
}
return String.valueOf(instanceCap);
}

public int getInstanceCap() {
Expand Down Expand Up @@ -420,7 +418,7 @@ public DockerTransientNode provisionNode(DockerAPI api, TaskListener listener) t

final DockerClient client = api.getClient();

CreateContainerCmd cmd = client.createContainerCmd(getImage());
final CreateContainerCmd cmd = client.createContainerCmd(getImage());
fillContainerConfig(cmd);

// Unique ID we use both as Node identifier and container Name
Expand All @@ -430,29 +428,43 @@ public DockerTransientNode provisionNode(DockerAPI api, TaskListener listener) t

connector.beforeContainerCreated(api, remoteFs, cmd);

LOGGER.info("Trying to run container {} from image: {}", uid, getImage());
String containerId = cmd.exec().getId();
LOGGER.info("Trying to run container for node {} from image: {}", uid, getImage());
boolean finallyRemoveTheContainer = true;
final String containerId = cmd.exec().getId();
// if we get this far, we have created the container so,
// if we fail to return the node, we need to ensure it's cleaned up.
LOGGER.info("Started container ID {} for node {} from image: {}", containerId, uid, getImage());

try {
connector.beforeContainerStarted(api, remoteFs, containerId);
client.startContainerCmd(containerId).exec();
connector.afterContainerStarted(api, remoteFs, containerId);
} catch (DockerException e) {

final ComputerLauncher launcher = connector.createLauncher(api, containerId, remoteFs, listener);

final DockerTransientNode node = new DockerTransientNode(uid, containerId, remoteFs, launcher);
node.setNodeDescription("Docker Agent [" + getImage() + " on "+ api.getDockerHost().getUri() + " ID " + containerId + "]");
node.setMode(mode);
node.setLabelString(labelString);
node.setRetentionStrategy(retentionStrategy);
node.setNodeProperties(nodeProperties);
node.setRemoveVolumes(removeVolumes);
node.setDockerAPI(api);
finallyRemoveTheContainer = false;
return node;
} finally {
// if something went wrong, cleanup aborted container
client.removeContainerCmd(containerId).withForce(true).exec();
// while ensuring that the original exception escapes.
if ( finallyRemoveTheContainer ) {
try {
client.removeContainerCmd(containerId).withForce(true).exec();
} catch (NotFoundException ex) {
LOGGER.info("Unable to remove container '" + containerId + "' as it had already gone.");
} catch (Throwable ex) {
LOGGER.error("Unable to remove container '" + containerId + "' due to exception:", ex);
}
}
}

final ComputerLauncher launcher = connector.createLauncher(api, containerId, remoteFs, listener);

DockerTransientNode node = new DockerTransientNode(uid, containerId, remoteFs, launcher);
node.setNodeDescription("Docker Agent [" + getImage() + " on "+ api.getDockerHost().getUri() + "]");
node.setMode(mode);
node.setLabelString(labelString);
node.setRetentionStrategy(retentionStrategy);
node.setNodeProperties(nodeProperties);
node.setRemoveVolumes(removeVolumes);
node.setDockerAPI(api);
return node;
}


Expand Down
117 changes: 95 additions & 22 deletions src/main/java/io/jenkins/docker/DockerTransientNode.java
Expand Up @@ -13,6 +13,10 @@
import hudson.slaves.ComputerLauncher;
import io.jenkins.docker.client.DockerAPI;
import jenkins.model.Jenkins;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.NoExternalUse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nonnull;
import java.io.IOException;
Expand All @@ -24,21 +28,21 @@
* @author <a href="mailto:nicolas.deloof@gmail.com">Nicolas De Loof</a>
*/
public class DockerTransientNode extends Slave {
private static final Logger LOGGER = LoggerFactory.getLogger(DockerTransientNode.class.getName());

//Keeping real containerId information, but using containerName as containerId
private final String containerId;

private final String containerName;

private DockerAPI dockerAPI;
private transient DockerAPI dockerAPI;

private boolean removeVolumes;

private String cloudId;

private AtomicBoolean acceptingTasks = new AtomicBoolean(true);


public DockerTransientNode(@Nonnull String uid, String containerId, String workdir, ComputerLauncher launcher) throws Descriptor.FormException, IOException {
super(nodeName(uid), workdir, launcher);
this.containerName = uid;
Expand Down Expand Up @@ -73,10 +77,18 @@ public void setDockerAPI(DockerAPI dockerAPI) {
this.dockerAPI = dockerAPI;
}

/** @return The {@link DockerAPI} for our cloud. */
public DockerAPI getDockerAPI() {
if (dockerAPI == null) {
final DockerCloud cloud = getCloud();
if (cloud != null) {
dockerAPI = cloud.getDockerApi();
}
}
return dockerAPI;
}

@Override
public String getDisplayName() {
if (cloudId != null) {
return getNodeName() + " on " + cloudId;
Expand Down Expand Up @@ -105,48 +117,109 @@ public DockerComputer createComputer() {
return new DockerComputer(this);
}

public void terminate(TaskListener listener) {
private interface ILogger {
void println(String msg);

void error(String msg, Throwable ex);
}

public void terminate(final TaskListener listener) {
final ILogger tl = new ILogger() {
@Override
public void println(String msg) {
listener.getLogger().println(msg);
LOGGER.info(msg);
}

@Override
public void error(String msg, Throwable ex) {
listener.error(msg, ex);
LOGGER.error(msg, ex);
}
};
try {
toComputer().disconnect(new DockerOfflineCause());
listener.getLogger().println("Disconnected computer");
} catch (Exception e) {
listener.error("Can't disconnect", e);
terminate(tl);
} catch (Throwable ex) {
tl.error("Failure while terminating '" + name + "':", ex);
}
}

/**
* Tries to remove all trace of this node, logging anything that goes wrong.
* <p>
* Note: This is not intended for use outside the plugin.
*/
@Restricted(NoExternalUse.class)
public void terminate(final Logger logger) {
final ILogger tl = new ILogger() {
@Override
public void println(String msg) {
logger.info(msg);
}

@Override
public void error(String msg, Throwable ex) {
logger.error(msg, ex);
}
};
try {
terminate(tl);
} catch (Throwable ex) {
tl.error("Failure while terminating '" + name + "':", ex);
}
}

private void terminate(ILogger logger) {
try {
final Computer computer = toComputer();
if (computer != null) {
computer.disconnect(new DockerOfflineCause());
logger.println("Disconnected computer for slave '" + name + "'.");
}
} catch (Exception ex) {
logger.error("Can't disconnect computer for slave '" + name + "' due to exception:", ex);
}

final String containerId = getContainerId();
Computer.threadPoolForRemoting.submit(() -> {
final String containerId = getContainerId();
DockerClient client = dockerAPI.getClient();
final DockerClient client;
try {
final DockerAPI api = getDockerAPI();
client = api.getClient();
} catch (RuntimeException ex) {
logger.error("Unable to stop and remove container '" + containerId + "' for slave '" + name + "' due to exception:", ex);
return;
}

try {
client.stopContainerCmd(containerId)
.withTimeout(10)
.exec();
listener.getLogger().println("Stopped container "+ containerId);
logger.println("Stopped container '"+ containerId + "' for slave '" + name + "'.");
} catch(NotFoundException e) {
listener.getLogger().println("Container already removed " + containerId);
logger.println("Can't stop container '" + containerId + "' for slave '" + name + "' as it does not exist.");
return; // no point trying to remove the container if it's already gone.
} catch (Exception ex) {
listener.error("Failed to stop instance " + getContainerId() + " for slave " + name + " due to exception", ex.getMessage());
listener.error("Causing exception for failure on stopping the instance was", ex);
logger.error("Failed to stop container '" + containerId + "' for slave '" + name + "' due to exception:", ex);
}

try {
client.removeContainerCmd(containerId)
.withRemoveVolumes(removeVolumes)
.exec();

listener.getLogger().println("Removed container " + containerId);
logger.println("Removed container '" + containerId + "' for slave '" + name + "'.");
} catch (NotFoundException e) {
listener.getLogger().println("Container already gone.");
logger.println("Container '" + containerId + "' already gone for slave '" + name + "'.");
} catch (Exception ex) {
listener.error("Failed to remove instance " + getContainerId() + " for slave " + name + " due to exception: " + ex.getMessage());
listener.error("Causing exception for failre on removing instance was", ex);
logger.error("Failed to remove container '" + containerId + "' for slave '" + name + "' due to exception:", ex);
}
});

try {
Jenkins.getInstance().removeNode(this);
} catch (IOException e) {
listener.error("Failed to remove Docker Node", e);
logger.println("Removed Node for slave '" + name + "'.");
} catch (IOException ex) {
logger.error("Failed to remove Node for slave '" + name + "' due to exception:", ex);
}
}

Expand All @@ -158,8 +231,8 @@ public DockerCloud getCloud() {
throw new RuntimeException("Failed to retrieve Cloud " + cloudId);
}

if (cloud.getClass() != DockerCloud.class) {
throw new RuntimeException(cloudId + " is not DockerCloud");
if (!(cloud instanceof DockerCloud)) {
throw new RuntimeException(cloudId + " is not a DockerCloud, it's a " + cloud.getClass().toString());
}

return (DockerCloud) cloud;
Expand Down

0 comments on commit 32ece82

Please sign in to comment.