#!/usr/bin/env python3
"""
Clean docker images

This serves as a substitute for Kubernetes ImageGC
which has thresholds that are not sufficiently configurable on GKE
at this time.
"""

from collections import defaultdict
import logging
import os
import time

import docker
import requests

logging.basicConfig(format='%(asctime)s %(message)s',
                    level=logging.INFO)


def get_used_percent(path):
    """
    Return disk usage as a percentage
    (100 is full, 0 is empty)

    Calculated from blocks or inodes,
    whichever reports as the most full.
    """
    stat = os.statvfs(path)
    inodes_avail = stat.f_favail / stat.f_files
    blocks_avail = stat.f_bavail / stat.f_blocks
    return 100 * (1 - min(blocks_avail, inodes_avail))
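

# Worked example for get_used_percent: on a filesystem with 1000 blocks
# (250 available to unprivileged users) and 500 inodes (400 available),
# blocks_avail = 0.25 and inodes_avail = 0.8, so the function reports
# 100 * (1 - min(0.25, 0.8)) = 75% used -- the fuller of the two metrics wins.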


# NOTE: this module-level key is shadowed by the topology-aware image_key
# defined inside get_docker_images below, which also accounts for descendants.
def image_key(image):
    """Sort key for images

    Prefers untagged images, sorted by size
    """
    return (not image.tags, image.attrs['Size'])


def get_docker_images(client):
    """Return list of docker images, sorted for safe deletion

    Images with the fewest descendants come first (children before parents),
    then untagged images, then the largest images.
    """
    images = client.images.list(all=True)
    # create dict by image id for O(1) lookup
    by_id = {image.id: image for image in images}
    # graph contains a set of all descendant (not just immediate)
    # images for each image
    graph = defaultdict(set)
    for image in images:
        # walk up the parent chain, recording this image as a descendant
        # of every ancestor, not only of its direct parent
        ancestor = image
        while ancestor.attrs['Parent']:
            parent_id = ancestor.attrs['Parent']
            graph[parent_id].add(image)
            ancestor = by_id[parent_id]

    def image_key(image):
        """Sort images topologically and by size

        - Prefer images with fewer descendants, so that we never try to delete
          an image before its children (fails with 409)
        - Prefer untagged images to tagged ones (delete build intermediates first)
        - Sort topological peers by size
        """
        return (-len(graph[image.id]), not image.tags, image.attrs['Size'])

    images.sort(key=image_key, reverse=True)
    return images
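

# Example of the resulting order: for a parent chain base -> mid -> leaf
# where only base is tagged, the keys are base: (-2, False, ...),
# mid: (-1, True, ...), leaf: (0, True, ...); sorting with reverse=True
# yields [leaf, mid, base], so children are always removed before parents.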


def cordon(kube, node):
    """cordon a kubernetes node"""
    kube.patch_node(
        node,
        {
            "spec": {
                "unschedulable": True,
            },
        },
    )


def uncordon(kube, node):
    """uncordon a kubernetes node"""
    kube.patch_node(
        node,
        {
            "spec": {
                "unschedulable": False,
            },
        },
    )
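

# Patching spec.unschedulable is what `kubectl cordon <node>` and
# `kubectl uncordon <node>` do; cordoned nodes show as SchedulingDisabled
# in `kubectl get nodes`.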


def main():
    node = os.getenv('NODE_NAME')
    if node:
        import kubernetes.config
        import kubernetes.client
        try:
            kubernetes.config.load_incluster_config()
        except Exception:
            kubernetes.config.load_kube_config()
        kube = kubernetes.client.CoreV1Api()
        # verify that we can talk to the node
        kube.read_node(node)

    path_to_check = os.getenv('PATH_TO_CHECK', '/var/lib/docker')
    interval = float(os.getenv('IMAGE_GC_INTERVAL', '300'))
    delay = float(os.getenv('IMAGE_GC_DELAY', '1'))
    # hysteresis thresholds: start pruning when usage exceeds gc_high,
    # keep pruning until it drops below gc_low
    gc_low = float(os.getenv('IMAGE_GC_THRESHOLD_LOW', '60'))
    gc_high = float(os.getenv('IMAGE_GC_THRESHOLD_HIGH', '80'))

    logging.info(f'Pruning docker images when {path_to_check} has {gc_high}% inodes or blocks used')
    client = docker.from_env(version='auto')
    # list images once at startup; this also verifies that we can talk to docker
    images = get_docker_images(client)

    while True:
        used = get_used_percent(path_to_check)
        logging.info(f'{used:.1f}% used')
        if used < gc_high:
            # Do nothing! We have enough space
            pass
        else:
            images = get_docker_images(client)
            if not images:
                logging.info('No images to delete')
                time.sleep(interval)
                continue
            else:
                logging.info(f'{len(images)} images available to prune')

            start = time.perf_counter()
            images_before = len(images)

            if node:
                logging.info(f"Cordoning node {node}")
                cordon(kube, node)

            deleted = 0
            while images and get_used_percent(path_to_check) > gc_low:
                # Ensure the node is still cordoned
                if node:
                    logging.info(f"Cordoning node {node}")
                    cordon(kube, node)
                # Remove the first candidate image:
                # fewest descendants, untagged and biggest first
                image = images.pop(0)
                if image.tags:
                    # does it have a name, e.g. jupyter/base-notebook:12345
                    name = image.tags[0]
                else:
                    # no name, use id
                    name = image.id

                gb = image.attrs['Size'] / (2**30)
                logging.info(f'Removing {name} (size={gb:.2f}GB)')
                try:
                    client.images.remove(image=image.id, force=True)
                    logging.info(f'Removed {name}')
                    # Delay between deletions.
                    # A sleep here avoids monopolizing the Docker API with deletions.
                    time.sleep(delay)
                except docker.errors.APIError as e:
                    if e.status_code == 409:
                        # This means the image cannot be removed right now
                        logging.info(f'Failed to remove {name}, skipping this image')
                        logging.info(str(e))
                    elif e.status_code == 404:
                        logging.info(f'{name} not found, probably already deleted')
                    else:
                        raise
                except requests.exceptions.ReadTimeout:
                    logging.warning(f'Timeout removing {name}')
                    # Delay longer after a timeout, which indicates that Docker is overworked
                    time.sleep(max(delay, 30))
                else:
                    deleted += 1

            if node:
                logging.info(f"Uncordoning node {node}")
                uncordon(kube, node)

            # log what we did and how long it took
            duration = time.perf_counter() - start
            attempted = images_before - len(images)
            logging.info(f"Deleted {deleted} of {attempted} candidate images in {int(duration)} seconds")

        time.sleep(interval)


if __name__ == '__main__':
    main()
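
# Example invocation (a sketch; every variable is optional and falls back
# to the defaults read in main(); "my-node" is a placeholder):
#
#   NODE_NAME=my-node PATH_TO_CHECK=/var/lib/docker \
#   IMAGE_GC_THRESHOLD_HIGH=75 IMAGE_GC_THRESHOLD_LOW=50 ./image-cleaner.py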