Skip to content

Commit

Permalink
Garbage collection of orphan pods (#1543)
Browse files Browse the repository at this point in the history
  • Loading branch information
Vlatombe committed May 7, 2024
1 parent 22eec6b commit f10083a
Show file tree
Hide file tree
Showing 16 changed files with 382 additions and 8 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,20 @@ adequate communication from Jenkins to the Kubernetes cluster, as seen below

![image](images/cloud-configuration.png)


### Garbage collection (beta)

In some exceptional cases, agent pods can be left behind, with no declared Jenkins agent in the controller. They will try to reconnect over and over, until something deletes them.

The plugin provides a garbage collection mechanism to clean up these pods. As it has been introduced recently,
and generates extra load on the Kubernetes API server, it is disabled by default.

Feel free to enable it and provide feedback about this functionality.

![image](images/garbage-collection.png)

## Static pod templates

In addition to that, in the **Kubernetes Pod Template** section, we need to configure the image that will be used to
spin up the agent pod. We do not recommend overriding the `jnlp` container except under unusual circumstances.
For your agent, you can use the default Jenkins agent image available in [Docker Hub](https://hub.docker.com). In the
Expand Down
Binary file added images/garbage-collection.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@
<systemPropertyVariables>
<hudson.slaves.NodeProvisioner.initialDelay>0</hudson.slaves.NodeProvisioner.initialDelay>
<hudson.slaves.NodeProvisioner.recurrencePeriod>3000</hudson.slaves.NodeProvisioner.recurrencePeriod>
<org.csanchez.jenkins.plugins.kubernetes.GarbageCollection.recurrencePeriod>5</org.csanchez.jenkins.plugins.kubernetes.GarbageCollection.recurrencePeriod>
<org.jenkinsci.plugins.workflow.support.pickles.ExecutorPickle.timeoutForNodeMillis>60000</org.jenkinsci.plugins.workflow.support.pickles.ExecutorPickle.timeoutForNodeMillis>
<!-- listen in this interface for connections from kubernetes pods -->
<connectorHost>${connectorHost}</connectorHost>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
package org.csanchez.jenkins.plugins.kubernetes;

import static org.csanchez.jenkins.plugins.kubernetes.PodTemplateBuilder.LABEL_KUBERNETES_CONTROLLER;
import static org.csanchez.jenkins.plugins.kubernetes.PodTemplateUtils.sanitizeLabel;

import edu.umd.cs.findbugs.annotations.NonNull;
import hudson.Extension;
import hudson.Main;
import hudson.Util;
import hudson.model.AbstractDescribableImpl;
import hudson.model.AsyncPeriodicWork;
import hudson.model.Descriptor;
import hudson.model.TaskListener;
import hudson.util.FormValidation;
import io.fabric8.kubernetes.api.model.Pod;
import java.io.IOException;
import java.time.Duration;
import java.time.Instant;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import jenkins.model.Jenkins;
import jenkins.util.SystemProperties;
import org.jenkinsci.plugins.kubernetes.auth.KubernetesAuthException;
import org.kohsuke.stapler.DataBoundConstructor;
import org.kohsuke.stapler.DataBoundSetter;
import org.kohsuke.stapler.QueryParameter;

/**
* Manages garbage collection of orphaned pods.
*/
public class GarbageCollection extends AbstractDescribableImpl<GarbageCollection> {
public static final String ANNOTATION_LAST_REFRESH = "kubernetes.jenkins.io/last-refresh";
private static final Logger LOGGER = Logger.getLogger(GarbageCollection.class.getName());

public static final int MINIMUM_GC_TIMEOUT = 120;

private String namespaces;
private transient Set<String> namespaceSet;
private int timeout;

private static Long RECURRENCE_PERIOD = SystemProperties.getLong(
GarbageCollection.class.getName() + ".recurrencePeriod", TimeUnit.MINUTES.toSeconds(1));

@DataBoundConstructor
public GarbageCollection() {}

public String getNamespaces() {
return namespaces;
}

@DataBoundSetter
public void setNamespaces(String namespaces) {
this.namespaces = Util.fixEmptyAndTrim(namespaces);
if (this.namespaces == null) {
this.namespaceSet = Set.of();
} else {
this.namespaceSet = Set.of(this.namespaces.split("\n"));
}
}

public int getTimeout() {
return timeout;
}

protected Object readResolve() {
if (namespaces != null) {
setNamespaces(namespaces);
}
return this;
}

@DataBoundSetter
public void setTimeout(int timeout) {
if (Main.isUnitTest) {
this.timeout = timeout;
} else {
this.timeout = Math.max(timeout, MINIMUM_GC_TIMEOUT);
}
}

public Duration getDurationTimeout() {
return Duration.ofSeconds(timeout);
}

@NonNull
public Set<String> getNamespaceSet() {
return namespaceSet == null ? Set.of() : namespaceSet;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
GarbageCollection that = (GarbageCollection) o;
return timeout == that.timeout && Objects.equals(namespaces, that.namespaces);
}

@Override
public int hashCode() {
return Objects.hash(namespaces, timeout);
}

@Override
public String toString() {
return "GarbageCollection{" + "namespaces='" + namespaces + '\'' + ", timeout=" + timeout + '}';
}

@Extension
public static class DescriptorImpl extends Descriptor<GarbageCollection> {
@SuppressWarnings("unused") // stapler
public FormValidation doCheckTimeout(@QueryParameter String value) {
return FormValidation.validateIntegerInRange(value, MINIMUM_GC_TIMEOUT, Integer.MAX_VALUE);
}
}

/**
* Annotate pods owned by live Kubernetes agents to help with garbage collection.
*/
@Extension
public static final class PeriodicGarbageCollection extends AsyncPeriodicWork {
public PeriodicGarbageCollection() {
super("Garbage collection of orphaned Kubernetes pods");
}

@Override
protected void execute(TaskListener listener) throws IOException, InterruptedException {
annotateLiveAgents(listener);
garbageCollect();
}

private static void annotateLiveAgents(TaskListener listener) {
Arrays.stream(Jenkins.get().getComputers())
.filter(KubernetesComputer.class::isInstance)
.map(KubernetesComputer.class::cast)
.forEach(kc -> kc.annotateTtl(listener));
}

private static void garbageCollect() {
for (var cloud : Jenkins.get().clouds.getAll(KubernetesCloud.class)) {
Optional.ofNullable(cloud.getGarbageCollection()).ifPresent(gc -> {
try {
var client = cloud.connect();
var namespaces = new HashSet<String>();
namespaces.add(client.getNamespace());
namespaces.addAll(gc.getNamespaceSet());
for (var ns : namespaces) {
client
.pods()
.inNamespace(ns)
// Only look at pods created by this controller
.withLabel(LABEL_KUBERNETES_CONTROLLER, sanitizeLabel(cloud.getJenkinsUrlOrNull()))
.list()
.getItems()
.stream()
.filter(pod -> {
var lastRefresh = pod.getMetadata()
.getAnnotations()
.get(ANNOTATION_LAST_REFRESH);
if (lastRefresh != null) {
try {
var refreshTime = Long.parseLong(lastRefresh);
var now = Instant.now();
LOGGER.log(
Level.FINE,
() -> getQualifiedName(pod) + " refresh diff = "
+ (now.toEpochMilli() - refreshTime) + ", timeout is "
+ gc.getDurationTimeout()
.toMillis());
return Duration.between(Instant.ofEpochMilli(refreshTime), now)
.compareTo(gc.getDurationTimeout())
> 0;
} catch (NumberFormatException e) {
LOGGER.log(
Level.WARNING,
e,
() -> "Unable to parse last refresh for pod "
+ getQualifiedName(pod) + ", ignoring");
return false;
}
} else {
LOGGER.log(
Level.FINE, () -> "Ignoring legacy pod " + getQualifiedName(pod));
return false;
}
})
.forEach(pod -> {
LOGGER.log(Level.INFO, () -> "Deleting orphan pod " + getQualifiedName(pod));
client.resource(pod).delete();
});
}
} catch (KubernetesAuthException e) {
LOGGER.log(Level.WARNING, "Error authenticating to Kubernetes", e);
} catch (IOException e) {
LOGGER.log(Level.WARNING, "Error while getting Kubernetes client", e);
}
});
}
}

private static String getQualifiedName(@NonNull Pod pod) {
var metadata = pod.getMetadata();
return metadata.getNamespace() + "/" + metadata.getName();
}

@Override
public long getRecurrencePeriod() {
return TimeUnit.SECONDS.toMillis(RECURRENCE_PERIOD);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ public class KubernetesCloud extends Cloud implements PodTemplateGroup {
@CheckForNull
private PodRetention podRetention = PodRetention.getKubernetesCloudDefault();

@CheckForNull
private GarbageCollection garbageCollection;

@DataBoundConstructor
public KubernetesCloud(String name) {
super(name);
Expand Down Expand Up @@ -334,6 +337,15 @@ public boolean isCapOnlyOnAlivePods() {
return capOnlyOnAlivePods;
}

/**
 * @return the garbage collection configuration of this cloud, or {@code null} when none is set
 */
public GarbageCollection getGarbageCollection() {
return garbageCollection;
}

/**
 * Sets the garbage collection configuration of this cloud.
 *
 * @param garbageCollection the configuration to store; the backing field is declared
 *     {@code @CheckForNull}, so {@code null} is an accepted value
 */
@DataBoundSetter
public void setGarbageCollection(GarbageCollection garbageCollection) {
this.garbageCollection = garbageCollection;
}

/**
* @return same as {@link #getJenkinsUrlOrNull}, if set
* @throws IllegalStateException if no Jenkins URL could be computed.
Expand Down Expand Up @@ -767,6 +779,7 @@ public boolean equals(Object o) {
&& Objects.equals(getPodLabels(), that.getPodLabels())
&& Objects.equals(podRetention, that.podRetention)
&& Objects.equals(waitForPodSec, that.waitForPodSec)
&& Objects.equals(garbageCollection, that.garbageCollection)
&& useJenkinsProxy == that.useJenkinsProxy;
}

Expand Down Expand Up @@ -794,7 +807,8 @@ public int hashCode() {
usageRestricted,
maxRequestsPerHost,
podRetention,
useJenkinsProxy);
useJenkinsProxy,
garbageCollection);
}

public Integer getWaitForPodSec() {
Expand Down Expand Up @@ -1068,7 +1082,8 @@ public String toString() {
+ waitForPodSec + ", podRetention="
+ podRetention + ", useJenkinsProxy="
+ useJenkinsProxy + ", templates="
+ templates + '}';
+ templates + ", garbageCollection="
+ garbageCollection + '}';
}

private Object readResolve() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import hudson.model.Computer;
import hudson.model.Executor;
import hudson.model.Queue;
import hudson.model.TaskListener;
import hudson.security.ACL;
import hudson.security.Permission;
import hudson.slaves.AbstractCloudComputer;
Expand All @@ -18,6 +19,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import jenkins.model.Jenkins;
Expand Down Expand Up @@ -165,6 +167,10 @@ public ACL getACL() {
return new KubernetesComputerACL(base);
}

/**
 * Delegates to the {@code annotateTtl} method of the node backing this computer.
 * Does nothing when the computer currently has no node.
 *
 * @param listener receives progress and error output from the node
 */
public void annotateTtl(TaskListener listener) {
    var backingNode = Optional.ofNullable(getNode());
    backingNode.ifPresent(node -> node.annotateTtl(listener));
}

/**
* Simple static inner class to be used by {@link #getACL()}.
* It replaces an anonymous inner class in order to fix
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import io.fabric8.kubernetes.client.KubernetesClientException;
import io.fabric8.kubernetes.client.utils.Serialization;
import java.io.IOException;
import java.time.Instant;
import java.util.HashSet;
import java.util.Locale;
import java.util.Objects;
Expand Down Expand Up @@ -273,7 +274,7 @@ public Cloud getCloud() {
}

public Optional<Pod> getPod() {
return pod == null ? Optional.empty() : Optional.of(pod);
return Optional.ofNullable(pod);
}

/**
Expand Down Expand Up @@ -538,6 +539,39 @@ public static Builder builder() {
return new Builder();
}

/**
 * Stamps this agent's pod with the {@link GarbageCollection#ANNOTATION_LAST_REFRESH}
 * annotation, set to the current time in milliseconds since the epoch.
 *
 * <p>Does nothing when the owning cloud has no garbage collection configured. Failures are
 * reported to {@code listener} and never thrown. The success message is only emitted when
 * the patch call returned without throwing.
 *
 * @param listener receives the success message or the error details
 */
public void annotateTtl(TaskListener listener) {
    try {
        var kubernetesCloud = getKubernetesCloud();
        if (kubernetesCloud.getGarbageCollection() == null) {
            return;
        }
        var ns = getNamespace();
        var name = getPodName();
        var patch = "{\"metadata\":{\"annotations\":{\"" + GarbageCollection.ANNOTATION_LAST_REFRESH
                + "\":\"" + Instant.now().toEpochMilli() + "\"}}}";
        try {
            kubernetesCloud
                    .connect()
                    .pods()
                    .inNamespace(ns)
                    .withName(name)
                    .patch(patch);
            // Report success only once the patch actually went through.
            listener.getLogger().println("Annotated agent pod " + ns + "/" + name + " with TTL");
            LOGGER.log(Level.FINE, () -> "Annotated agent pod " + ns + "/" + name + " with TTL");
        } catch (KubernetesAuthException e) {
            e.printStackTrace(listener.error("Failed to authenticate to Kubernetes cluster"));
        } catch (IOException e) {
            e.printStackTrace(listener.error("Failed to connect to Kubernetes cluster"));
        }
        // NOTE(review): save() is kept unconditional, as before; it is not obvious from this
        // method what state needs persisting here - confirm whether it is still required.
        try {
            save();
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, e, () -> "Failed to save");
        }
    } catch (RuntimeException e) {
        e.printStackTrace(listener.error("Failed to annotate agent pod with TTL"));
    }
}

/**
* Builds a {@link KubernetesSlave} instance.
*/
Expand Down
Loading

0 comments on commit f10083a

Please sign in to comment.