/
prepare.go
329 lines (303 loc) · 10 KB
/
prepare.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
package main
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runtime-spec/specs-go"
log "github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
)
// Note: these definitions come from src/moby/config.go and should be kept in sync with it.
// Runtime is the configuration that is processed at runtime, around container
// creation. It is not used to build the OCI spec.
type Runtime struct {
	// Cgroups lists the cgroups that prepareFilesystem creates.
	Cgroups []string `yaml:"cgroups" json:"cgroups,omitempty"`
	// Mounts lists the mounts that prepareFilesystem performs, in order.
	Mounts []specs.Mount `yaml:"mounts" json:"mounts,omitempty"`
	// Mkdir lists the directories that prepareFilesystem creates once all
	// Mounts are in place.
	Mkdir []string `yaml:"mkdir" json:"mkdir,omitempty"`
	// Interfaces lists the network interfaces that prepareProcess creates
	// and/or moves to the container process.
	Interfaces []Interface `yaml:"interfaces" json:"interfaces,omitempty"`
	// BindNS holds the paths at which prepareProcess bind mounts the
	// namespaces of the container process.
	BindNS Namespaces `yaml:"bindNS" json:"bindNS,omitempty"`
	// Namespace is not read anywhere in this file.
	// NOTE(review): presumably consumed by the caller - confirm.
	Namespace string `yaml:"namespace,omitempty" json:"namespace,omitempty"`
}
// Namespaces configures the paths at which namespaces get bind mounted.
//
// Every field is the destination path for the namespace of the same name;
// prepareProcess hands each of them to bindNS, which skips empty paths.
type Namespaces struct {
	Cgroup string `yaml:"cgroup" json:"cgroup,omitempty"`
	Ipc    string `yaml:"ipc" json:"ipc,omitempty"`
	Mnt    string `yaml:"mnt" json:"mnt,omitempty"`
	Net    string `yaml:"net" json:"net,omitempty"`
	Pid    string `yaml:"pid" json:"pid,omitempty"`
	User   string `yaml:"user" json:"user,omitempty"`
	Uts    string `yaml:"uts" json:"uts,omitempty"`
}
// Interface is the runtime config for one network interface, as handled by
// prepareProcess.
type Interface struct {
	// Name of the interface; prepareProcess rejects an empty name.
	Name string `yaml:"name" json:"name,omitempty"`
	// Add is the link type to create. When empty, an existing interface
	// called Name is looked up and moved instead.
	Add string `yaml:"add" json:"add,omitempty"`
	// Peer is the peer name of a veth pair. Setting Peer while leaving Add
	// empty implies a veth.
	Peer string `yaml:"peer" json:"peer,omitempty"`
	// CreateInRoot requests creation in the root namespace first, followed
	// by a move to the container process.
	CreateInRoot bool `yaml:"createInRoot" json:"createInRoot"`
}
// getRuntimeConfig loads the runtime configuration from the file runtime.json
// inside the directory path.
//
// A missing file is not an error: the zero value Runtime is returned, so that
// nothing gets done. Any other read failure, as well as content that cannot be
// decoded into a Runtime, is reported through log.Fatalf.
func getRuntimeConfig(path string) Runtime {
	var cfg Runtime

	raw, err := os.ReadFile(filepath.Join(path, "runtime.json"))
	switch {
	case err == nil:
		// file was read, decode it below
	case os.IsNotExist(err):
		// no config at all is fine, hand back the empty runtime
		return cfg
	default:
		log.Fatalf("Cannot read runtime config: %v", err)
	}

	if err := json.Unmarshal(raw, &cfg); err != nil {
		log.Fatalf("Cannot parse runtime config: %v", err)
	}
	return cfg
}
// parseMountOptions takes fstab style mount options and splits them into the
// two values used with a standard mount() syscall: the flags bitmask and the
// comma separated data string.
//
// Options listed in the table below either set or clear a bit of the bitmask.
// They are applied in the order given, so a later option overrides an earlier
// one. Every other option, and any table entry whose flag value is zero (such
// as "defaults"), is passed through untouched as part of the data string.
func parseMountOptions(options []string) (int, string) {
	type mountFlag struct {
		clear bool // true: the option removes the bit instead of setting it
		flag  int
	}
	known := map[string]mountFlag{
		"async":         {clear: true, flag: unix.MS_SYNCHRONOUS},
		"atime":         {clear: true, flag: unix.MS_NOATIME},
		"bind":          {clear: false, flag: unix.MS_BIND},
		"defaults":      {clear: false, flag: 0},
		"dev":           {clear: true, flag: unix.MS_NODEV},
		"diratime":      {clear: true, flag: unix.MS_NODIRATIME},
		"dirsync":       {clear: false, flag: unix.MS_DIRSYNC},
		"exec":          {clear: true, flag: unix.MS_NOEXEC},
		"mand":          {clear: false, flag: unix.MS_MANDLOCK},
		"noatime":       {clear: false, flag: unix.MS_NOATIME},
		"nodev":         {clear: false, flag: unix.MS_NODEV},
		"nodiratime":    {clear: false, flag: unix.MS_NODIRATIME},
		"noexec":        {clear: false, flag: unix.MS_NOEXEC},
		"nomand":        {clear: true, flag: unix.MS_MANDLOCK},
		"norelatime":    {clear: true, flag: unix.MS_RELATIME},
		"nostrictatime": {clear: true, flag: unix.MS_STRICTATIME},
		"nosuid":        {clear: false, flag: unix.MS_NOSUID},
		"private":       {clear: false, flag: unix.MS_PRIVATE},
		"rbind":         {clear: false, flag: unix.MS_BIND | unix.MS_REC},
		"relatime":      {clear: false, flag: unix.MS_RELATIME},
		"remount":       {clear: false, flag: unix.MS_REMOUNT},
		"ro":            {clear: false, flag: unix.MS_RDONLY},
		"rw":            {clear: true, flag: unix.MS_RDONLY},
		"shared":        {clear: false, flag: unix.MS_SHARED},
		"slave":         {clear: false, flag: unix.MS_SLAVE},
		"strictatime":   {clear: false, flag: unix.MS_STRICTATIME},
		"suid":          {clear: true, flag: unix.MS_NOSUID},
		"sync":          {clear: false, flag: unix.MS_SYNCHRONOUS},
		"unbindable":    {clear: false, flag: unix.MS_UNBINDABLE},
	}

	var (
		mask  int
		extra []string
	)
	for _, opt := range options {
		entry, ok := known[opt]
		if !ok || entry.flag == 0 {
			// not in the table, or the flag is not supported on the
			// platform: it is a data value for a specific fs type
			extra = append(extra, opt)
			continue
		}
		if entry.clear {
			mask &^= entry.flag
			continue
		}
		mask |= entry.flag
	}
	return mask, strings.Join(extra, ",")
}
// newCgroup creates a cgroup, which is nothing more than a directory.
// We could use github.com/containerd/cgroups, but it has a lot of deps and
// this is just a sugary mkdir.
//
// With cgroup v2 a single directory is created below /sys/fs/cgroup. With
// cgroup v1 one directory is created below every directory found in
// /sys/fs/cgroup. A directory that cannot be created is only logged; an error
// is returned solely when the cgroup version cannot be determined or
// /sys/fs/cgroup cannot be listed.
func newCgroup(cgroup string) error {
	const root = "/sys/fs/cgroup"

	unified, err := isCgroupV2()
	if err != nil {
		return err
	}

	// mkdir creates one cgroup directory, reporting a failure to the log only
	mkdir := func(dir string) {
		if err := os.MkdirAll(dir, 0755); err != nil {
			log.Printf("cgroup error: %v", err)
		}
	}

	if unified {
		// a cgroupv2 cgroup is a single directory
		mkdir(filepath.Join(root, cgroup))
		return nil
	}

	// a cgroupv1 cgroup is a directory under all directories in /sys/fs/cgroup
	entries, err := os.ReadDir(root)
	if err != nil {
		return err
	}
	for _, entry := range entries {
		if entry.IsDir() {
			mkdir(filepath.Join(root, entry.Name(), cgroup))
		}
	}
	return nil
}
// isCgroupV2 reports whether the file /sys/fs/cgroup/cgroup.controllers
// exists, which is what this file treats as the sign of cgroup v2.
//
// A missing file yields (false, nil). Any other failure to stat the file is
// returned as the error, together with false.
func isCgroupV2() (bool, error) {
	switch _, err := os.Stat("/sys/fs/cgroup/cgroup.controllers"); {
	case err == nil:
		return true, nil
	case os.IsNotExist(err):
		return false, nil
	default:
		return false, err
	}
}
// prepareFilesystem sets up the mounts, directories and cgroups described by
// runtime, before the container is created.
//
// path is the directory that holds the container's rootfs directory. Relative
// mount destinations and mkdir entries are resolved against path/rootfs,
// absolute ones are used as given.
//
// The work is done in a fixed order: Mounts, then Mkdir, then Cgroups. Mounts
// come before Mkdir so that a directory can be made under a mount. The first
// failure stops processing and is returned with the underlying error wrapped,
// so callers can inspect it with errors.Is / errors.As. Steps that already
// completed are not undone.
func prepareFilesystem(path string, runtime Runtime) error {
	// in future we may need to change the structure to set mode, ownership
	const mode os.FileMode = 0755

	rootfs := filepath.Join(path, "rootfs")
	makeAbsolute := func(dir string) string {
		if filepath.IsAbs(dir) {
			return dir
		}
		// relative paths are relative to rootfs of container
		return filepath.Join(rootfs, dir)
	}

	for _, mount := range runtime.Mounts {
		// mkdir of the destination path in case it is missing
		dir := makeAbsolute(mount.Destination)
		if err := os.MkdirAll(dir, mode); err != nil {
			return fmt.Errorf("Cannot create directory for mount destination %s: %w", dir, err)
		}
		// also mkdir upper and work directories on overlay; these paths are
		// used exactly as written in the option, not resolved against rootfs
		for _, o := range mount.Options {
			eq := strings.SplitN(o, "=", 2)
			if len(eq) == 2 && (eq[0] == "upperdir" || eq[0] == "workdir") {
				if err := os.MkdirAll(eq[1], mode); err != nil {
					return fmt.Errorf("Cannot create directory for overlay %s=%s: %w", eq[0], eq[1], err)
				}
			}
		}
		opts, data := parseMountOptions(mount.Options)
		if err := unix.Mount(mount.Source, dir, mount.Type, uintptr(opts), data); err != nil {
			return fmt.Errorf("Failed to mount %s: %w", mount.Source, err)
		}
	}

	for _, dir := range runtime.Mkdir {
		dir = makeAbsolute(dir)
		if err := os.MkdirAll(dir, mode); err != nil {
			return fmt.Errorf("Cannot create directory %s: %w", dir, err)
		}
	}

	for _, cgroup := range runtime.Cgroups {
		// currently no way to specify resource limits on new cgroups at creation time
		if err := newCgroup(cgroup); err != nil {
			return fmt.Errorf("Cannot create cgroup %s: %w", cgroup, err)
		}
	}

	return nil
}
// bindNS bind mounts a namespace file: /proc/<pid>/ns/<ns> is mounted at path.
//
// An empty path means there is nothing to do and nil is returned. Otherwise
// the leading directories of path are created as needed and path itself is
// created with os.Create (which truncates an existing regular file), because
// both have to exist for the bind to succeed.
//
// Every failure is returned with the underlying error wrapped, so callers can
// inspect it with errors.Is / errors.As.
func bindNS(ns string, path string, pid int) error {
	if path == "" {
		return nil
	}

	// the path and file need to exist for the bind to succeed, so try to create
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("Cannot create leading directories %s for bind mount destination: %w", dir, err)
	}
	fi, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("Cannot create a mount point for namespace bind at %s: %w", path, err)
	}
	// the file only serves as a mount point, it is never written to
	if err := fi.Close(); err != nil {
		return fmt.Errorf("Cannot close mount point %s for namespace bind: %w", path, err)
	}

	if err := unix.Mount(fmt.Sprintf("/proc/%d/ns/%s", pid, ns), path, "", unix.MS_BIND, ""); err != nil {
		return fmt.Errorf("Failed to bind %s namespace at %s: %w", ns, path, err)
	}
	return nil
}
// prepareProcess sets up anything that needs to be done after the container
// process is created, but before it runs, for example networking.
//
// For every entry of runtime.Interfaces:
//   - Name is mandatory.
//   - A non-empty Add creates a new link of that type. Setting Peer without
//     Add implies "veth", and a veth always requires a Peer.
//   - With CreateInRoot, and always for a veth pair, the link is created in
//     the root namespace first and then moved to pid. Otherwise it is created
//     directly in the namespace of pid.
//   - An empty Add looks up the existing link called Name and moves it to pid.
//
// Afterwards each namespace with a path set in runtime.BindNS is bind mounted
// at that path via bindNS.
//
// The first failure stops processing and is returned; netlink failures are
// wrapped so callers can inspect them with errors.Is / errors.As. Links that
// were created or moved before the failure are left as they are. Progress
// messages are written to stderr.
func prepareProcess(pid int, runtime Runtime) error {
	for _, iface := range runtime.Interfaces {
		if iface.Name == "" {
			return fmt.Errorf("Interface requires a name")
		}

		var link netlink.Link
		var ns interface{} = netlink.NsPid(pid)
		var move bool
		var err error

		// iface is a per-iteration copy, so defaulting Add here does not
		// modify the caller's configuration
		if iface.Peer != "" && iface.Add == "" {
			// must be a veth if specify peer
			iface.Add = "veth"
		}

		// if create in root is set, create in root namespace first, then move
		// also do the same for a veth pair
		if iface.CreateInRoot || iface.Add == "veth" {
			ns = nil
			move = true
		}

		if iface.Add != "" {
			switch iface.Add {
			case "veth":
				if iface.Peer == "" {
					return fmt.Errorf("Creating a veth pair %s requires a peer to be set", iface.Name)
				}
				la := netlink.LinkAttrs{Name: iface.Name, Namespace: ns}
				link = &netlink.Veth{LinkAttrs: la, PeerName: iface.Peer}
			default:
				// no special creation options needed
				la := netlink.LinkAttrs{Name: iface.Name, Namespace: ns}
				link = &netlink.GenericLink{LinkAttrs: la, LinkType: iface.Add}
			}
			if err := netlink.LinkAdd(link); err != nil {
				return fmt.Errorf("Link add %s of type %s failed: %w", iface.Name, iface.Add, err)
			}
			fmt.Fprintf(os.Stderr, "Created interface %s type %s\n", iface.Name, iface.Add)
		} else {
			// find existing interface
			link, err = netlink.LinkByName(iface.Name)
			if err != nil {
				return fmt.Errorf("Cannot find interface %s: %w", iface.Name, err)
			}
			// then move into namespace
			move = true
		}

		if move {
			if err := netlink.LinkSetNsPid(link, pid); err != nil {
				return fmt.Errorf("Cannot move interface %s into namespace: %w", iface.Name, err)
			}
			fmt.Fprintf(os.Stderr, "Moved interface %s to pid %d\n", iface.Name, pid)
		}
	}

	binds := []struct {
		ns   string
		path string
	}{
		{"cgroup", runtime.BindNS.Cgroup},
		{"ipc", runtime.BindNS.Ipc},
		{"mnt", runtime.BindNS.Mnt},
		{"net", runtime.BindNS.Net},
		{"pid", runtime.BindNS.Pid},
		{"user", runtime.BindNS.User},
		{"uts", runtime.BindNS.Uts},
	}
	for _, b := range binds {
		// bindNS ignores entries whose path is empty
		if err := bindNS(b.ns, b.path, pid); err != nil {
			return err
		}
	}

	return nil
}
// cleanup removes the root mount, i.e. it unmounts path/rootfs.
// Cleanup functions are best efforts only, mainly for rw onboot containers,
// which is why nothing is reported back to the caller.
func cleanup(path string) {
	// best effort: the result of the unmount is deliberately discarded
	_ = unix.Unmount(filepath.Join(path, "rootfs"), 0)
}