From 8d1d51654f364490fc93d23d02b6fce27aa0a5e9 Mon Sep 17 00:00:00 2001
From: Yifan Gu
Date: Wed, 29 Apr 2015 18:11:30 -0700
Subject: [PATCH] kubelet/rkt: Add routines for converting kubelet pod to rkt
 pod.

---
 pkg/kubelet/rkt/cap.go | 122 ++++++++++++++++++++
 pkg/kubelet/rkt/rkt.go | 247 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 369 insertions(+)
 create mode 100644 pkg/kubelet/rkt/cap.go

diff --git a/pkg/kubelet/rkt/cap.go b/pkg/kubelet/rkt/cap.go
new file mode 100644
index 000000000000..7ce27708e0fc
--- /dev/null
+++ b/pkg/kubelet/rkt/cap.go
@@ -0,0 +1,122 @@
+/*
+Copyright 2015 Google Inc. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package rkt
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+)
+
+const (
+	CAP_CHOWN = iota
+	CAP_DAC_OVERRIDE
+	CAP_DAC_READ_SEARCH
+	CAP_FOWNER
+	CAP_FSETID
+	CAP_KILL
+	CAP_SETGID
+	CAP_SETUID
+	CAP_SETPCAP
+	CAP_LINUX_IMMUTABLE
+	CAP_NET_BIND_SERVICE
+	CAP_NET_BROADCAST
+	CAP_NET_ADMIN
+	CAP_NET_RAW
+	CAP_IPC_LOCK
+	CAP_IPC_OWNER
+	CAP_SYS_MODULE
+	CAP_SYS_RAWIO
+	CAP_SYS_CHROOT
+	CAP_SYS_PTRACE
+	CAP_SYS_PACCT
+	CAP_SYS_ADMIN
+	CAP_SYS_BOOT
+	CAP_SYS_NICE
+	CAP_SYS_RESOURCE
+	CAP_SYS_TIME
+	CAP_SYS_TTY_CONFIG
+	CAP_MKNOD
+	CAP_LEASE
+	CAP_AUDIT_WRITE
+	CAP_AUDIT_CONTROL
+	CAP_SETFCAP
+	CAP_MAC_OVERRIDE
+	CAP_MAC_ADMIN
+	CAP_SYSLOG
+	CAP_WAKE_ALARM
+	CAP_BLOCK_SUSPEND
+	CAP_AUDIT_READ
+)
+
+var capabilityList = map[int]string{
+	CAP_CHOWN:            "CAP_CHOWN",
+	CAP_DAC_OVERRIDE:     "CAP_DAC_OVERRIDE",
+	CAP_DAC_READ_SEARCH:  "CAP_DAC_READ_SEARCH",
+	CAP_FOWNER:           "CAP_FOWNER",
+	CAP_FSETID:           "CAP_FSETID",
+	CAP_KILL:             "CAP_KILL",
+	CAP_SETGID:           "CAP_SETGID",
+	CAP_SETUID:           "CAP_SETUID",
+	CAP_SETPCAP:          "CAP_SETPCAP",
+	CAP_LINUX_IMMUTABLE:  "CAP_LINUX_IMMUTABLE",
+	CAP_NET_BIND_SERVICE: "CAP_NET_BIND_SERVICE",
+	CAP_NET_BROADCAST:    "CAP_NET_BROADCAST",
+	CAP_NET_ADMIN:        "CAP_NET_ADMIN",
+	CAP_NET_RAW:          "CAP_NET_RAW",
+	CAP_IPC_LOCK:         "CAP_IPC_LOCK",
+	CAP_IPC_OWNER:        "CAP_IPC_OWNER",
+	CAP_SYS_MODULE:       "CAP_SYS_MODULE",
+	CAP_SYS_RAWIO:        "CAP_SYS_RAWIO",
+	CAP_SYS_CHROOT:       "CAP_SYS_CHROOT",
+	CAP_SYS_PTRACE:       "CAP_SYS_PTRACE",
+	CAP_SYS_PACCT:        "CAP_SYS_PACCT",
+	CAP_SYS_ADMIN:        "CAP_SYS_ADMIN",
+	CAP_SYS_BOOT:         "CAP_SYS_BOOT",
+	CAP_SYS_NICE:         "CAP_SYS_NICE",
+	CAP_SYS_RESOURCE:     "CAP_SYS_RESOURCE",
+	CAP_SYS_TIME:         "CAP_SYS_TIME",
+	CAP_SYS_TTY_CONFIG:   "CAP_SYS_TTY_CONFIG",
+	CAP_MKNOD:            "CAP_MKNOD",
+	CAP_LEASE:            "CAP_LEASE",
+	CAP_AUDIT_WRITE:      "CAP_AUDIT_WRITE",
+	CAP_AUDIT_CONTROL:    "CAP_AUDIT_CONTROL",
+	CAP_SETFCAP:          "CAP_SETFCAP",
+	CAP_MAC_OVERRIDE:     "CAP_MAC_OVERRIDE",
+	CAP_MAC_ADMIN:        "CAP_MAC_ADMIN",
+	CAP_SYSLOG:           "CAP_SYSLOG",
+	CAP_WAKE_ALARM:       "CAP_WAKE_ALARM",
+	CAP_BLOCK_SUSPEND:    "CAP_BLOCK_SUSPEND",
+	CAP_AUDIT_READ:       "CAP_AUDIT_READ",
+}
+
+func getAllCapabilities() string {
+	var capabilities []string
+	for _, cap := range capabilityList {
+		capabilities = append(capabilities, fmt.Sprintf("%q", cap))
+	}
+	return strings.Join(capabilities, ",")
+}
+
+func getCapabilities(caps []api.CapabilityType) string {
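+	// Quote each capability name so the joined result can be embedded
+	// directly in the isolator JSON's "set" array below.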
+	var capList []string
+	for _, cap := range caps {
+		capList = append(capList, fmt.Sprintf("%q", cap))
+	}
+	return strings.Join(capList, ",")
+}
diff --git a/pkg/kubelet/rkt/rkt.go b/pkg/kubelet/rkt/rkt.go
index 2a4ac3e2589c..9b1f5c95f044 100644
--- a/pkg/kubelet/rkt/rkt.go
+++ b/pkg/kubelet/rkt/rkt.go
@@ -21,8 +21,14 @@ import (
 	"os/exec"
 	"strings"
 
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/capabilities"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/credentialprovider"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/volume"
+	appcschema "github.com/appc/spec/schema"
+	appctypes "github.com/appc/spec/schema/types"
 	"github.com/coreos/go-systemd/dbus"
+	"github.com/coreos/rkt/store"
 	"github.com/golang/glog"
 )
@@ -142,3 +148,244 @@ func (r *Runtime) runCommand(args ...string) ([]string, error) {
 	}
 	return strings.Split(strings.TrimSpace(string(output)), "\n"), nil
 }
+
+type resource struct {
+	limit   string
+	request string
+}
+
+// setIsolators overrides the isolators of the pod manifest if necessary.
+func setIsolators(app *appctypes.App, c *api.Container) error {
+	var isolator appctypes.Isolator
+	if len(c.Capabilities.Add) > 0 || len(c.Capabilities.Drop) > 0 || len(c.Resources.Limits) > 0 || len(c.Resources.Requests) > 0 {
+		app.Isolators = []appctypes.Isolator{}
+	}
+
+	// Retained capabilities/privileged.
+	privileged := false
+	if capabilities.Get().AllowPrivileged {
+		privileged = c.Privileged
+	} else if c.Privileged {
+		glog.Errorf("Privileged is disallowed globally")
+		// TODO(yifan): Return error?
+	}
+	var caps string
+	var value []byte
+	if privileged {
+		caps = getAllCapabilities()
+	} else {
+		caps = getCapabilities(c.Capabilities.Add)
+	}
+	if len(caps) > 0 {
+		value = []byte(fmt.Sprintf(`{"name":"os/linux/capabilities-retain-set","value":{"set":[%s]}}`, caps))
+		if err := isolator.UnmarshalJSON(value); err != nil {
+			glog.Errorf("Cannot unmarshal the retained capabilities %q: %v", value, err)
+			return err
+		}
+		app.Isolators = append(app.Isolators, isolator)
+	}
+
+	// Removed capabilities.
+	caps = getCapabilities(c.Capabilities.Drop)
+	if len(caps) > 0 {
+		value = []byte(fmt.Sprintf(`{"name":"os/linux/capabilities-remove-set","value":{"set":[%s]}}`, caps))
+		if err := isolator.UnmarshalJSON(value); err != nil {
+			glog.Errorf("Cannot unmarshal the removed capabilities %q: %v", value, err)
+			return err
+		}
+		app.Isolators = append(app.Isolators, isolator)
+	}
+
+	// Resources.
+	resources := make(map[api.ResourceName]resource)
+	for name, quantity := range c.Resources.Limits {
+		resources[name] = resource{limit: quantity.String()}
+	}
+	for name, quantity := range c.Resources.Requests {
+		r, ok := resources[name]
+		if !ok {
+			r = resource{}
+		}
+		r.request = quantity.String()
+		resources[name] = r
+	}
+	for name, res := range resources {
+		switch name {
+		case api.ResourceCPU:
+			name = "resource/cpu"
+		case api.ResourceMemory:
+			name = "resource/memory"
+		default:
+			glog.Warningf("Resource type not supported: %v", name)
+			continue
+		}
+		value = []byte(fmt.Sprintf(`{"name":%q,"value":{"request":%q,"limit":%q}}`, name, res.request, res.limit))
+		if err := isolator.UnmarshalJSON(value); err != nil {
+			glog.Errorf("Cannot unmarshal the resource %q: %v", value, err)
+			return err
+		}
+		app.Isolators = append(app.Isolators, isolator)
+	}
+	return nil
+}
+
+// setApp overrides the app's fields if any of them are specified in the
+// container's spec.
+func setApp(app *appctypes.App, c *api.Container) error {
+	// Override the exec.
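+	// If Command and/or Args are set in the container spec, the resulting
+	// exec is Command followed by Args; otherwise the image manifest's exec
+	// is kept unchanged.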
+	// TODO(yifan): Revisit this for the overriding rule.
+	if len(c.Command) > 0 || len(c.Args) > 0 {
+		app.Exec = append(c.Command, c.Args...)
+	}
+
+	// TODO(yifan): Use a non-root user in the future?
+	// Currently it's a bug, as reported in https://github.com/coreos/rkt/issues/539.
+	// However, since we cannot get the user/group information from the container
+	// spec, maybe we can use the file path to set the user/group?
+	app.User, app.Group = "0", "0"
+
+	// Override the working directory.
+	if len(c.WorkingDir) > 0 {
+		app.WorkingDirectory = c.WorkingDir
+	}
+
+	// Override the environment.
+	// TODO(yifan): Use RunContainerOptions.
+	if len(c.Env) > 0 {
+		app.Environment = []appctypes.EnvironmentVariable{}
+	}
+	for _, env := range c.Env {
+		app.Environment = append(app.Environment, appctypes.EnvironmentVariable{
+			Name:  env.Name,
+			Value: env.Value,
+		})
+	}
+
+	// Override the mount points.
+	if len(c.VolumeMounts) > 0 {
+		app.MountPoints = []appctypes.MountPoint{}
+	}
+	for _, m := range c.VolumeMounts {
+		mountPointName, err := appctypes.NewACName(m.Name)
+		if err != nil {
+			glog.Errorf("Cannot use the volume mount's name %q as ACName: %v", m.Name, err)
+			return err
+		}
+		app.MountPoints = append(app.MountPoints, appctypes.MountPoint{
+			Name:     *mountPointName,
+			Path:     m.MountPath,
+			ReadOnly: m.ReadOnly,
+		})
+	}
+
+	// Override the ports.
+	if len(c.Ports) > 0 {
+		app.Ports = []appctypes.Port{}
+	}
+	for _, p := range c.Ports {
+		portName, err := appctypes.NewACName(p.Name)
+		if err != nil {
+			glog.Errorf("Cannot use the port's name %q as ACName: %v", p.Name, err)
+			return err
+		}
+		app.Ports = append(app.Ports, appctypes.Port{
+			Name:     *portName,
+			Protocol: string(p.Protocol),
+			Port:     uint(p.ContainerPort),
+		})
+	}
+
+	// Override isolators.
+	return setIsolators(app, c)
+}
+
+// makePodManifest transforms a kubelet pod spec to the rkt pod manifest.
+// TODO(yifan): Use the RunContainerOptions generated by GenerateRunContainerOptions().
+func (r *Runtime) makePodManifest(pod *api.Pod, volumeMap map[string]volume.Volume) (*appcschema.PodManifest, error) {
+	manifest := appcschema.BlankPodManifest()
+
+	// Get the image manifests, assuming they are already in the cas,
+	// and extract the app field from each image to be the 'base app'.
+	//
+	// We do this because we will fully replace the image manifest's app
+	// with the pod manifest's app in the rkt runtime. See below:
+	//
+	// https://github.com/coreos/rkt/issues/723.
+	//
+	ds, err := store.NewStore(rktDataDir)
+	if err != nil {
+		glog.Errorf("Cannot open store: %v", err)
+		return nil, err
+	}
+	for _, c := range pod.Spec.Containers {
+		// Assume we are running docker images for now, see #7203.
+		imageID, err := r.getImageID(c.Image)
+		if err != nil {
+			return nil, fmt.Errorf("cannot get image ID for %q: %v", c.Image, err)
+		}
+		hash, err := appctypes.NewHash(imageID)
+		if err != nil {
+			glog.Errorf("Cannot create new hash from %q", imageID)
+			return nil, err
+		}
+
+		im, err := ds.GetImageManifest(hash.String())
+		if err != nil {
+			glog.Errorf("Cannot get image manifest: %v", err)
+			return nil, err
+		}
+
+		// Override the image manifest's app and store it in the pod manifest.
+		app := im.App
+		if err := setApp(app, &c); err != nil {
+			return nil, err
+		}
+		manifest.Apps = append(manifest.Apps, appcschema.RuntimeApp{
+			Name:  im.Name,
+			Image: appcschema.RuntimeImage{ID: *hash},
+			App:   app,
+		})
+	}
+
+	// Set global volumes.
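+	// Each kubelet volume becomes an appc "host" volume, sourced from the
+	// volume plugin's path on the node.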
+	for name, vol := range volumeMap {
+		volName, err := appctypes.NewACName(name)
+		if err != nil {
+			glog.Errorf("Cannot use the volume's name %q as ACName: %v", name, err)
+			return nil, err
+		}
+		manifest.Volumes = append(manifest.Volumes, appctypes.Volume{
+			Name:   *volName,
+			Kind:   "host",
+			Source: vol.GetPath(),
+		})
+	}
+
+	// Set global ports.
+	for _, c := range pod.Spec.Containers {
+		for _, port := range c.Ports {
+			portName, err := appctypes.NewACName(port.Name)
+			if err != nil {
+				glog.Errorf("Cannot use the port's name %q as ACName: %v", port.Name, err)
+				return nil, err
+			}
+			manifest.Ports = append(manifest.Ports, appctypes.ExposedPort{
+				Name:     *portName,
+				HostPort: uint(port.HostPort),
+			})
+		}
+	}
+	// TODO(yifan): Set pod-level isolators once it's supported in kubernetes.
+	return manifest, nil
+}
+
+// TODO(yifan): Need to implement image management in rkt.
+func (r *Runtime) getImageID(imageName string) (string, error) {
+	output, err := r.runCommand("fetch", imageName)
+	if err != nil {
+		return "", err
+	}
+	last := output[len(output)-1]
+	if !strings.HasPrefix(last, "sha512-") {
+		return "", fmt.Errorf("unexpected result: %q", last)
+	}
+	return last, nil
+}
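
Illustrative only, not part of the patch: a minimal sketch of how the new
makePodManifest helper might be driven from the rkt runtime. The wrapper name
preparePod and the JSON logging are assumptions for illustration; only
makePodManifest itself is introduced above.

	// Hypothetical caller, assuming the same package and imports as rkt.go,
	// plus "encoding/json".
	func (r *Runtime) preparePod(pod *api.Pod, volumeMap map[string]volume.Volume) error {
		manifest, err := r.makePodManifest(pod, volumeMap)
		if err != nil {
			return err
		}
		// PodManifest implements json.Marshaler, so the manifest can be
		// serialized here and handed to `rkt prepare`/`rkt run` in a
		// follow-up change.
		data, err := json.Marshal(manifest)
		if err != nil {
			return err
		}
		glog.V(4).Infof("Prepared pod manifest: %s", data)
		return nil
	}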