Skip to content

Commit

Permalink
[JENKINS-56347] Wait for agent to connect on provisioning
Browse files Browse the repository at this point in the history
Otherwise the provision method is called multiple times
and triggers multiple pod launches
  • Loading branch information
carlossg committed May 1, 2019
1 parent b9ea4ff commit 3225773
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 6 deletions.
Expand Up @@ -111,6 +111,15 @@ public void onClose(KubernetesClientException cause) {

}

/**
* Wait until all pod containers are running
*
* @return the pod
* @throws IllegalStateException
* if pod or containers are no longer running
* @throws KubernetesClientTimeoutException
* if time ran out
*/
public Pod await(long amount, TimeUnit timeUnit) {
long started = System.currentTimeMillis();
long alreadySpent = System.currentTimeMillis() - started;
Expand All @@ -137,6 +146,15 @@ private Pod awaitWatcher(long amount, TimeUnit timeUnit) {
}
}

/**
* Wait until all pod containers are running
*
* @return the pod
* @throws IllegalStateException
* if pod or containers are no longer running
* @throws KubernetesClientTimeoutException
* if time ran out
*/
private Pod periodicAwait(int i, long started, long interval, long amount) {
Pod pod = client.pods().inNamespace(this.pod.getMetadata().getNamespace())
.withName(this.pod.getMetadata().getName()).get();
Expand Down
Expand Up @@ -27,25 +27,32 @@
import static java.util.logging.Level.*;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;

import javax.annotation.CheckForNull;

import io.fabric8.kubernetes.client.Watch;
import org.apache.commons.lang.StringUtils;
import org.kohsuke.stapler.DataBoundConstructor;

import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;

import hudson.model.TaskListener;
import hudson.slaves.JNLPLauncher;
import hudson.slaves.SlaveComputer;

import io.fabric8.kubernetes.api.model.ContainerStatus;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.Watch;
import io.fabric8.kubernetes.client.dsl.LogWatch;
import io.fabric8.kubernetes.client.dsl.PrettyLoggable;

/**
* Launches on Kubernetes the specified {@link KubernetesComputer} instance.
Expand Down Expand Up @@ -110,16 +117,76 @@ public void launch(SlaveComputer computer, TaskListener listener) {
.stream().filter(s -> StringUtils.isNotBlank(s)).findFirst().orElse(null);
slave.setNamespace(namespace);

LOGGER.log(Level.FINE, "Creating Pod: {0} in namespace {1}", new Object[]{podId, namespace});
LOGGER.log(Level.FINE, "Creating Pod: {0}/{1}", new Object[] { namespace, podId });
pod = client.pods().inNamespace(namespace).create(pod);
LOGGER.log(INFO, "Created Pod: {0} in namespace {1}", new Object[]{podId, namespace});
listener.getLogger().printf("Created Pod: %s in namespace %s%n", podId, namespace);
LOGGER.log(INFO, "Created Pod: {0}/{1}", new Object[] { namespace, podId });
listener.getLogger().printf("Created Pod: %s/%s%n", namespace, podId);
String podName = pod.getMetadata().getName();
String namespace1 = pod.getMetadata().getNamespace();
watcher = new AllContainersRunningPodWatcher(client, pod);
try (Watch _w = client.pods().inNamespace(namespace1).withName(podName).watch(watcher)){
try (Watch _w = client.pods().inNamespace(namespace1).withName(podName).watch(watcher)) {
watcher.await(template.getSlaveConnectTimeout(), TimeUnit.SECONDS);
}
LOGGER.log(INFO, "Pod is running: {0}/{1}", new Object[] { namespace, podId });

// We need the pod to be running and connected before returning
// otherwise this method keeps being called multiple times
List<String> validStates = ImmutableList.of("Running");

int waitForSlaveToConnect = template.getSlaveConnectTimeout();
int waitedForSlave;

// now wait for agent to be online
SlaveComputer slaveComputer = null;
String status = null;
List<ContainerStatus> containerStatuses = null;
for (waitedForSlave = 0; waitedForSlave < waitForSlaveToConnect; waitedForSlave++) {
slaveComputer = slave.getComputer();
if (slaveComputer == null) {
throw new IllegalStateException("Node was deleted, computer is null");
}
if (slaveComputer.isOnline()) {
break;
}

// Check that the pod hasn't failed already
pod = client.pods().inNamespace(namespace).withName(podId).get();
if (pod == null) {
throw new IllegalStateException("Pod no longer exists: " + podId);
}
status = pod.getStatus().getPhase();
if (!validStates.contains(status)) {
break;
}

containerStatuses = pod.getStatus().getContainerStatuses();
List<ContainerStatus> terminatedContainers = new ArrayList<>();
for (ContainerStatus info : containerStatuses) {
if (info != null) {
if (info.getState().getTerminated() != null) {
// Container has errored
LOGGER.log(INFO, "Container is terminated {0} [{2}]: {1}",
new Object[] { podId, info.getState().getTerminated(), info.getName() });
listener.getLogger().printf("Container is terminated %1$s [%3$s]: %2$s%n", podId,
info.getState().getTerminated(), info.getName());
terminatedContainers.add(info);
}
}
}

checkTerminatedContainers(terminatedContainers, podId, namespace, slave, client);

LOGGER.log(INFO, "Waiting for agent to connect ({1}/{2}): {0}",
new Object[] { podId, waitedForSlave, waitForSlaveToConnect });
listener.getLogger().printf("Waiting for agent to connect (%2$s/%3$s): %1$s%n", podId, waitedForSlave,
waitForSlaveToConnect);
Thread.sleep(1000);
}
if (slaveComputer == null || slaveComputer.isOffline()) {
logLastLines(containerStatuses, podId, namespace, slave, null, client);
throw new IllegalStateException(
"Agent is not connected after " + waitedForSlave + " seconds, status: " + status);
}

computer.setAcceptingTasks(true);
launched = true;
Expand All @@ -142,6 +209,36 @@ public void launch(SlaveComputer computer, TaskListener listener) {
}
}

/**
 * Aborts with an exception when at least one container has terminated.
 *
 * Does nothing for an empty list. Otherwise it maps each container name to
 * its exit code, logs the tail of the containers' output through
 * {@code logLastLines}, and throws.
 *
 * @param terminatedContainers statuses of the containers found in a terminated state
 * @param podId name of the pod the containers belong to
 * @param namespace namespace of the pod
 * @param slave agent being provisioned, passed on for the log message
 * @param client client used to read the container logs
 * @throws IllegalStateException if {@code terminatedContainers} is not empty
 */
private void checkTerminatedContainers(List<ContainerStatus> terminatedContainers, String podId, String namespace,
        KubernetesSlave slave, KubernetesClient client) {
    if (terminatedContainers.isEmpty()) {
        return;
    }

    // container name -> exit code of its terminated state
    Map<String, Integer> exitCodes = terminatedContainers.stream()
            .collect(Collectors.toMap(
                    ContainerStatus::getName,
                    status -> status.getState().getTerminated().getExitCode()));

    // Surface the last output of the failed containers before failing
    logLastLines(terminatedContainers, podId, namespace, slave, exitCodes, client);
    throw new IllegalStateException("Containers are terminated with exit codes: " + exitCodes);
}

/**
 * Log the last lines of containers logs.
 *
 * For each container the last 30 log lines are requested once; if they are
 * not blank they are written to the logger at SEVERE level together with
 * the agent, its template and, when available, the container exit code.
 *
 * @param containers statuses of the containers whose logs should be printed;
 *                   may be {@code null}, in which case nothing is logged
 * @param podId name of the pod the containers belong to
 * @param namespace namespace of the pod
 * @param slave agent being provisioned, used in the log message
 * @param errors exit codes keyed by container name, or {@code null} when the
 *               containers did not exit with an error
 * @param client client used to read the container logs
 */
private void logLastLines(List<ContainerStatus> containers, String podId, String namespace, KubernetesSlave slave,
        Map<String, Integer> errors, KubernetesClient client) {
    if (containers == null) {
        // The caller's status list stays null when its wait loop ends before
        // reading any container status; there is nothing to print then, and
        // failing here would hide the caller's own error.
        return;
    }
    for (ContainerStatus containerStatus : containers) {
        if (containerStatus == null) {
            continue;
        }
        String containerName = containerStatus.getName();
        PrettyLoggable<String, LogWatch> tailingLines = client.pods().inNamespace(namespace).withName(podId)
                .inContainer(containerName).tailingLines(30);
        // Fetch once and reuse, so the text logged is the text that was checked
        String log = tailingLines.getLog();
        if (!StringUtils.isBlank(log)) {
            String msg = errors != null ? String.format(" exited with error %s", errors.get(containerName)) : "";
            LOGGER.log(Level.SEVERE, "Error in provisioning; agent={0}, template={1}. Container {2}{3}. Logs: {4}",
                    new Object[] { slave, slave.getTemplate(), containerName, msg, log });
        }
    }
}

/**
* The last problem that occurred, if any.
* @return
Expand Down

0 comments on commit 3225773

Please sign in to comment.