Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mount SSH Secret directly on main container #416

Merged
merged 1 commit into from
Aug 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions examples/horovod/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,13 @@ RUN apt-get install -y --no-install-recommends openssh-client openssh-server &&
mkdir -p /var/run/sshd

# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
# by disabling StrictHostKeyChecking.
# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need
# to point UserKnownHostsFile at /dev/null so that ssh does not need write
# permissions on that folder.
# Disabling StrictModes skips the permission checks on the directory and its files.
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you update the comment to explain what each of the three options does?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

WORKDIR "/examples"

Expand Down
9 changes: 8 additions & 1 deletion examples/pi/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,12 @@ RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd
RUN useradd -m mpiuser
WORKDIR /home/mpiuser
COPY --chown=mpiuser sshd_config .sshd_config
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config
# Allow OpenSSH to talk to containers without asking for confirmation
# by disabling StrictHostKeyChecking.
# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need
# to point UserKnownHostsFile at /dev/null so that ssh does not need write
# permissions on that folder.
# Disabling StrictModes skips the permission checks on the directory and its files.
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apart from lines {8, 9, 32}, should we put the rest in a common image?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll do it in a follow-up.

COPY --from=builder /pi /home/mpiuser/pi
9 changes: 8 additions & 1 deletion examples/pi/intel.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ WORKDIR /home/mpiuser
COPY intel-entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
COPY --chown=mpiuser sshd_config .sshd_config
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config
# Allow OpenSSH to talk to containers without asking for confirmation
# by disabling StrictHostKeyChecking.
# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need
# to point UserKnownHostsFile at /dev/null so that ssh does not need write
# permissions on that folder.
# Disabling StrictModes skips the permission checks on the directory and its files.
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

COPY --from=builder /pi /home/mpiuser/pi
1 change: 1 addition & 0 deletions examples/pi/sshd_config
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
PidFile /home/mpiuser/sshd.pid
HostKey /home/mpiuser/.ssh/id_rsa
StrictModes no
6 changes: 6 additions & 0 deletions examples/tensorflow-benchmarks/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
FROM horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.6.0.post0-py3.7-cuda10.1

# mpi-operator mounts the .ssh folder from a Secret. For that to work, we need
# to point UserKnownHostsFile at /dev/null so that ssh does not need write
# permissions on that folder.
# Disabling StrictModes skips the permission checks on the directory and its files.
RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

RUN mkdir /tensorflow
WORKDIR "/tensorflow"
RUN git clone https://github.com/tensorflow/benchmarks
Expand Down
3 changes: 0 additions & 3 deletions v2/cmd/mpi-operator/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ type ServerOption struct {
LockNamespace string
QPS int
Burst int
ScriptingImage string
}

// NewServerOption creates a new CMServer with a default config.
Expand Down Expand Up @@ -69,6 +68,4 @@ func (s *ServerOption) AddFlags(fs *flag.FlagSet) {

fs.IntVar(&s.QPS, "kube-api-qps", 5, "QPS indicates the maximum QPS to the master from this client.")
fs.IntVar(&s.Burst, "kube-api-burst", 10, "Maximum burst for throttle.")

fs.StringVar(&s.ScriptingImage, "scripting-image", "alpine:3.14", "Container image used for scripting, such as in init containers.")
}
3 changes: 1 addition & 2 deletions v2/cmd/mpi-operator/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,7 @@ func Run(opt *options.ServerOption) error {
kubeInformerFactory.Core().V1().Pods(),
podgroupsInformer,
kubeflowInformerFactory.Kubeflow().V2beta1().MPIJobs(),
opt.GangSchedulingName,
opt.ScriptingImage)
opt.GangSchedulingName)

go kubeInformerFactory.Start(ctx.Done())
go kubeflowInformerFactory.Start(ctx.Done())
Expand Down
56 changes: 11 additions & 45 deletions v2/pkg/controller/mpi_job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,7 @@ const (
discoverHostsScriptName = "discover_hosts.sh"
sshAuthSecretSuffix = "-ssh"
sshAuthVolume = "ssh-auth"
sshAuthMountPath = "/mnt/ssh"
sshHomeInitMountPath = "/mnt/home-ssh"
sshHomeVolume = "ssh-home"
rootSSHPath = "/root/.ssh"
launcher = "launcher"
worker = "worker"
launcherSuffix = "-launcher"
Expand Down Expand Up @@ -242,8 +240,6 @@ type MPIJobController struct {
recorder record.EventRecorder
// Gang scheduler name to use
gangSchedulerName string
// Container image used for scripting.
scriptingImage string

// To allow injection of updateStatus for testing.
updateStatusHandler func(mpijob *kubeflow.MPIJob) error
Expand All @@ -261,7 +257,7 @@ func NewMPIJobController(
podInformer coreinformers.PodInformer,
podgroupsInformer podgroupsinformer.PodGroupInformer,
mpiJobInformer informers.MPIJobInformer,
gangSchedulerName, scriptingImage string) *MPIJobController {
gangSchedulerName string) *MPIJobController {

// Create event broadcaster.
klog.V(4).Info("Creating event broadcaster")
Expand Down Expand Up @@ -298,7 +294,6 @@ func NewMPIJobController(
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "MPIJobs"),
recorder: recorder,
gangSchedulerName: gangSchedulerName,
scriptingImage: scriptingImage,
}

controller.updateStatusHandler = controller.doUpdateJobStatus
Expand Down Expand Up @@ -1516,57 +1511,28 @@ func workerReplicas(job *kubeflow.MPIJob) int32 {
}

func (c *MPIJobController) setupSSHOnPod(podSpec *corev1.PodSpec, job *kubeflow.MPIJob) {
var mode *int32
if job.Spec.SSHAuthMountPath == rootSSHPath {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, why is this part of the job spec? Is the default /root/.ssh?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's the default. We need the user to tell us where to put the credentials for the image to use it.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting. Why would one want to change the SSH configuration in the container image? 😅

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The folder will host the keys, not the configuration. And this location varies depending on the home directory. There is no way we can deduce this from the image.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we just use the env variable $HOME somehow?

Copy link
Collaborator Author

@alculquicondor alculquicondor Aug 26, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We wouldn't have access to it from the controller.

Copy link

@ahg-g ahg-g Aug 26, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need it inside the controller for anything other than setting the MountPath below? If that is the only use, then it seems there is a way: https://kubernetes.io/docs/concepts/storage/volumes/#using-subpath-expanded-environment

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's pretty cool, but I would assume it's only possible for variables defined by the downward API (thus, known at Pod sandbox creation time, in kubelet).

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it seems so.

mode = newInt32(0600)
}
mainContainer := &podSpec.Containers[0]
podSpec.Volumes = append(podSpec.Volumes,
corev1.Volume{
Name: sshAuthVolume,
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: job.Name + sshAuthSecretSuffix,
Items: sshVolumeItems,
DefaultMode: mode,
SecretName: job.Name + sshAuthSecretSuffix,
Items: sshVolumeItems,
},
},
},
corev1.Volume{
Name: sshHomeVolume,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
})

mainContainer := &podSpec.Containers[0]
mainContainer.VolumeMounts = append(mainContainer.VolumeMounts,
corev1.VolumeMount{
Name: sshHomeVolume,
Name: sshAuthVolume,
MountPath: job.Spec.SSHAuthMountPath,
})

// The init script sets the permissions of the ssh folder in the user's home
// directory. The ownership is set based on the security context of the
// launcher's first container.
launcherSecurityCtx := job.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.Containers[0].SecurityContext
initScript := "" +
"cp -RL /mnt/ssh/* /mnt/home-ssh && " +
"chmod 700 /mnt/home-ssh && " +
"chmod 600 /mnt/home-ssh/*"
if launcherSecurityCtx != nil && launcherSecurityCtx.RunAsUser != nil {
initScript += fmt.Sprintf(" && chown %d -R /mnt/home-ssh", *launcherSecurityCtx.RunAsUser)
}
podSpec.InitContainers = append(podSpec.InitContainers, corev1.Container{
Name: "init-ssh",
Image: c.scriptingImage,
VolumeMounts: []corev1.VolumeMount{
{
Name: sshAuthVolume,
MountPath: sshAuthMountPath,
},
{
Name: sshHomeVolume,
MountPath: sshHomeInitMountPath,
},
},
Command: []string{"/bin/sh"},
Args: []string{"-c", initScript},
})
}

func ownerReferenceAndGVK(object metav1.Object) (*metav1.OwnerReference, schema.GroupVersionKind, error) {
Expand Down
Loading