Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Garbage collection of orphan pods #1543

Merged
merged 7 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
package org.csanchez.jenkins.plugins.kubernetes;

import static org.csanchez.jenkins.plugins.kubernetes.PodTemplateBuilder.LABEL_KUBERNETES_INSTANCE;
import static org.csanchez.jenkins.plugins.kubernetes.PodTemplateUtils.sanitizeLabel;

import edu.umd.cs.findbugs.annotations.NonNull;
import hudson.Extension;
import hudson.Util;
import hudson.model.AbstractDescribableImpl;
import hudson.model.AsyncPeriodicWork;
import hudson.model.Descriptor;
import hudson.model.TaskListener;
import hudson.util.FormValidation;
import io.fabric8.kubernetes.api.model.Pod;
import java.io.IOException;
import java.time.Duration;
import java.time.Instant;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import jenkins.model.Jenkins;
import org.jenkinsci.plugins.kubernetes.auth.KubernetesAuthException;
import org.kohsuke.stapler.DataBoundConstructor;
import org.kohsuke.stapler.DataBoundSetter;
import org.kohsuke.stapler.QueryParameter;

/**
* Manages garbage collection of orphaned pods.
*/
public class GarbageCollection extends AbstractDescribableImpl<GarbageCollection> {
public static final String ANNOTATION_LAST_REFRESH = "kubernetes.jenkins.io/last-refresh";
private static final Logger LOGGER = Logger.getLogger(GarbageCollection.class.getName());

public static final int MINIMUM_GC_TIMEOUT = 120;

private String namespaces;
private transient Set<String> namespaceSet;
private int timeout;

@DataBoundConstructor
public GarbageCollection() {}

public String getNamespaces() {
return namespaces;
}

@DataBoundSetter
public void setNamespaces(String namespaces) {
this.namespaces = Util.fixEmptyAndTrim(namespaces);
if (this.namespaces == null) {
this.namespaceSet = Set.of();
} else {
this.namespaceSet = Set.of(this.namespaces.split("\n"));
}
}

public int getTimeout() {
return timeout;
}

protected Object readResolve() {
if (namespaces != null) {
setNamespaces(namespaces);
}
return this;
}

@DataBoundSetter
public void setTimeout(int timeout) {
this.timeout = Math.max(timeout, MINIMUM_GC_TIMEOUT);
}

public Duration getDurationTimeout() {
return Duration.ofSeconds(timeout);
}

@NonNull
public Set<String> getNamespaceSet() {
return namespaceSet == null ? Set.of() : namespaceSet;
}

@Extension
public static class DescriptorImpl extends Descriptor<GarbageCollection> {
@SuppressWarnings("unused") // stapler
public FormValidation doCheckTimeout(@QueryParameter String value) {
return FormValidation.validateIntegerInRange(value, MINIMUM_GC_TIMEOUT, Integer.MAX_VALUE);
}
}

/**
* Annotate pods owned by live Kubernetes agents to help with garbage collection.
*/
@Extension
public static final class AnnotateLiveAgents extends AsyncPeriodicWork {

public AnnotateLiveAgents() {
super("Annotate Live Kubernetes Agents");
}

@Override
protected void execute(TaskListener listener) throws IOException, InterruptedException {
Arrays.stream(Jenkins.get().getComputers())
.filter(KubernetesComputer.class::isInstance)
.map(KubernetesComputer.class::cast)
.forEach(kc -> kc.annotateTtl(listener));
}

@Override
public long getRecurrencePeriod() {
return TimeUnit.MINUTES.toMillis(1);
}
}

/**
* Reap orphan pods that have not been refreshed for a while
*/
@Extension
public static final class ReapOrphanPods extends AsyncPeriodicWork {
public ReapOrphanPods() {
super("Garbage collect orphan pods");
Vlatombe marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
protected void execute(TaskListener listener) throws IOException, InterruptedException {
for (var cloud : Jenkins.get().clouds.getAll(KubernetesCloud.class)) {
Optional.ofNullable(cloud.getGarbageCollection()).ifPresent(gc -> {
try {
var client = cloud.connect();
var namespaces = new HashSet<String>();
namespaces.add(client.getNamespace());
namespaces.addAll(gc.getNamespaceSet());
for (var ns : namespaces) {
client
.pods()
.inNamespace(ns)
// Only look at pods created by this controller
.withLabel(LABEL_KUBERNETES_INSTANCE, sanitizeLabel(cloud.getJenkinsUrlOrNull()))
.list()
.getItems()
.stream()
.filter(pod -> {
var lastRefresh = pod.getMetadata()
.getAnnotations()
.get(ANNOTATION_LAST_REFRESH);
if (lastRefresh != null) {
try {
var refreshTime = Long.parseLong(lastRefresh);
LOGGER.log(
Level.FINEST,
() -> getQualifiedName(pod) + " last refreshed at "
+ refreshTime);
return Duration.between(
Instant.ofEpochMilli(refreshTime),
Instant.now())
.compareTo(gc.getDurationTimeout())
> 0;
} catch (NumberFormatException e) {
LOGGER.log(
Level.WARNING,
e,
() -> "Unable to parse last refresh for pod "
+ getQualifiedName(pod) + ", ignoring");
return false;
}
} else {
LOGGER.log(
Level.FINE, () -> "Ignoring legacy pod " + getQualifiedName(pod));
return false;
}
})
.forEach(pod -> {
LOGGER.log(Level.INFO, () -> "Deleting orphan pod " + getQualifiedName(pod));
client.resource(pod).delete();
});
}
} catch (KubernetesAuthException e) {
LOGGER.log(Level.WARNING, "Error authenticating to Kubernetes", e);
} catch (IOException e) {
LOGGER.log(Level.WARNING, "Error while getting Kubernetes client", e);
}
});
}
}

private static String getQualifiedName(@NonNull Pod pod) {
var metadata = pod.getMetadata();
return metadata.getNamespace() + "/" + metadata.getName();
}

@Override
public long getRecurrencePeriod() {
return TimeUnit.MINUTES.toMillis(1);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ public class KubernetesCloud extends Cloud implements PodTemplateGroup {
@CheckForNull
private PodRetention podRetention = PodRetention.getKubernetesCloudDefault();

private GarbageCollection garbageCollection;
Vlatombe marked this conversation as resolved.
Show resolved Hide resolved

@DataBoundConstructor
public KubernetesCloud(String name) {
super(name);
Expand Down Expand Up @@ -334,6 +336,15 @@ public boolean isCapOnlyOnAlivePods() {
return capOnlyOnAlivePods;
}

public GarbageCollection getGarbageCollection() {
return garbageCollection;
}

@DataBoundSetter
public void setGarbageCollection(GarbageCollection garbageCollection) {
this.garbageCollection = garbageCollection;
}

/**
* @return same as {@link #getJenkinsUrlOrNull}, if set
* @throws IllegalStateException if no Jenkins URL could be computed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import hudson.model.Computer;
import hudson.model.Executor;
import hudson.model.Queue;
import hudson.model.TaskListener;
import hudson.security.ACL;
import hudson.security.Permission;
import hudson.slaves.AbstractCloudComputer;
Expand All @@ -18,6 +19,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import jenkins.model.Jenkins;
Expand Down Expand Up @@ -165,6 +167,13 @@ public ACL getACL() {
return new KubernetesComputerACL(base);
}

public void annotateTtl(TaskListener listener) {
if (!isOnline()) {
Vlatombe marked this conversation as resolved.
Show resolved Hide resolved
return;
}
Optional.ofNullable(getNode()).ifPresent(ks -> ks.annotateTtl(listener));
}

/**
* Simple static inner class to be used by {@link #getACL()}.
* It replaces an anonymous inner class in order to fix
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,13 @@
import hudson.slaves.SlaveComputer;
import io.fabric8.kubernetes.api.model.Container;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.api.model.PodBuilder;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientException;
import io.fabric8.kubernetes.client.utils.Serialization;
import java.io.IOException;
import java.time.Duration;
import java.time.Instant;
import java.util.HashSet;
import java.util.Locale;
import java.util.Objects;
Expand Down Expand Up @@ -85,6 +88,8 @@ public class KubernetesSlave extends AbstractCloudSlave {
@CheckForNull
private transient Pod pod;

private long lastRefresh;

@NonNull
public PodTemplate getTemplate() throws IllegalStateException {
// Look up updated pod template after a restart
Expand Down Expand Up @@ -273,7 +278,7 @@ public Cloud getCloud() {
}

public Optional<Pod> getPod() {
return pod == null ? Optional.empty() : Optional.of(pod);
return Optional.ofNullable(pod);
}

/**
Expand Down Expand Up @@ -538,6 +543,51 @@ public static Builder builder() {
return new Builder();
}

public void annotateTtl(TaskListener listener) {
var kubernetesCloud = getKubernetesCloud();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be null?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Raises an IllegalStateException if so, which is caught in order to avoid breaking the whole loop.

Optional.ofNullable(kubernetesCloud.getGarbageCollection()).ifPresent(gc -> {
var ns = getNamespace();
var name = getPodName();
var l = Instant.now();
if (lastRefresh == 0
Vlatombe marked this conversation as resolved.
Show resolved Hide resolved
|| Duration.between(Instant.ofEpochMilli(lastRefresh), Instant.now())
.compareTo(gc.getDurationTimeout().dividedBy(2))
> 0) {
try {
kubernetesCloud
.connect()
.pods()
.inNamespace(ns)
.withName(name)
.edit(p -> {
Vlatombe marked this conversation as resolved.
Show resolved Hide resolved
if (p == null) return null;
return new PodBuilder(p)
.editMetadata()
.addToAnnotations(
GarbageCollection.ANNOTATION_LAST_REFRESH,
String.valueOf(l.toEpochMilli()))
.endMetadata()
.build();
});
} catch (KubernetesAuthException e) {
e.printStackTrace(listener.error("Failed to authenticate to Kubernetes cluster"));
} catch (IOException e) {
e.printStackTrace(listener.error("Failed to connect to Kubernetes cluster"));
}
lastRefresh = l.toEpochMilli();
listener.getLogger().println("Annotated agent pod " + ns + "/" + name + " with TTL");
LOGGER.log(Level.FINE, () -> "Annotated agent pod " + ns + "/" + name + " with TTL");
try {
save();
} catch (IOException e) {
LOGGER.log(Level.WARNING, e, () -> "Failed to save");
}
} else {
LOGGER.log(Level.FINEST, () -> "Not refreshing agent pod " + ns + "/" + name + " yet");
}
});
}

/**
* Builds a {@link KubernetesSlave} instance.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import static org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud.JNLP_NAME;
import static org.csanchez.jenkins.plugins.kubernetes.PodTemplateUtils.combine;
import static org.csanchez.jenkins.plugins.kubernetes.PodTemplateUtils.isNullOrEmpty;
import static org.csanchez.jenkins.plugins.kubernetes.PodTemplateUtils.sanitizeLabel;
import static org.csanchez.jenkins.plugins.kubernetes.PodTemplateUtils.substituteEnv;

import edu.umd.cs.findbugs.annotations.CheckForNull;
Expand Down Expand Up @@ -98,6 +99,8 @@ public class PodTemplateBuilder {
private static final String WORKSPACE_VOLUME_NAME = "workspace-volume";
public static final Pattern FROM_DIRECTIVE = Pattern.compile("^FROM (.*)$");

public static final String LABEL_KUBERNETES_INSTANCE = "kubernetes.jenkins.io/instance";
Vlatombe marked this conversation as resolved.
Show resolved Hide resolved

@SuppressFBWarnings(value = "MS_SHOULD_BE_FINAL", justification = "tests")
@Restricted(NoExternalUse.class)
static String DEFAULT_JNLP_DOCKER_REGISTRY_PREFIX =
Expand Down Expand Up @@ -230,6 +233,9 @@ public Pod build() {
if (!labels.isEmpty()) {
metadataBuilder.withLabels(labels);
}
if (cloud != null) {
metadataBuilder.addToLabels(LABEL_KUBERNETES_INSTANCE, sanitizeLabel(cloud.getJenkinsUrlOrNull()));
}

Map<String, String> annotations = getAnnotationsMap(template.getAnnotations());
if (!annotations.isEmpty()) {
Expand Down Expand Up @@ -618,6 +624,7 @@ private Map<String, String> getAnnotationsMap(List<PodAnnotation> annotations) {
builder.put(podAnnotation.getKey(), substituteEnv(podAnnotation.getValue()));
}
}
builder.put(GarbageCollection.ANNOTATION_LAST_REFRESH, String.valueOf(System.currentTimeMillis()));
return Collections.unmodifiableMap(builder);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -767,7 +767,10 @@ public static boolean validateImage(String image) {
* @return the sanitized and validated label
* @throws AssertionError if the generated label is not valid
*/
public static String sanitizeLabel(String input) {
public static String sanitizeLabel(@CheckForNull String input) {
if (input == null) {
return null;
}
int max = /* Kubernetes limit */ 63 - /* hyphen */ 1 - /* suffix */ 5;
String label;
if (input.length() > max) {
Expand Down
Loading