diff --git a/pytorch/Dockerfile b/pytorch/Dockerfile
index d83877d4f..c02ca9b26 100644
--- a/pytorch/Dockerfile
+++ b/pytorch/Dockerfile
@@ -80,9 +80,7 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin
     gcc \
     libgl1-mesa-glx \
     libglib2.0-0 \
-    virtualenv && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+    virtualenv
 
 ENV SIGOPT_PROJECT=.
 
@@ -91,17 +89,56 @@ COPY multinode-requirements.txt .
 
 RUN python -m pip install --no-cache-dir -r multinode-requirements.txt
 
+ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
+
+# Refresh the package index in this layer so the install does not rely on lists left behind by an earlier (possibly cached) layer
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    openssh-client \
+    openssh-server && \
+    rm /etc/ssh/ssh_host_*_key \
+       /etc/ssh/ssh_host_*_key.pub && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+# hadolint global ignore=SC2002
+RUN mkdir -p /var/run/sshd && \
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
 ARG PYTHON_VERSION
 
-RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.bashrc
+COPY generate_ssh_keys.sh .
+
+# Assemble ~/.startup, the file the entrypoint script sources on container start:
+# first a line that sources the oneCCL setvars.sh, then the contents of generate_ssh_keys.sh.
+# The copied script is deleted once its contents have been appended.
+RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \
+    cat '/generate_ssh_keys.sh' >> ~/.startup && \
+    rm -rf /generate_ssh_keys.sh
 
-ENV I_MPI_ROOT="${I_MPI_ROOT}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch"
-ENV CCL_ROOT="${CCL_ROOT}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch"
-ENV FI_PROVIDER_PATH="${FI_PROVIDER_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov"
-ENV LIBRARY_PATH="${LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
-ENV PATH="${PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/bin"
-ENV CPATH="${CPATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/include"
+# hadolint global ignore=SC3037
+RUN echo -e "#!/bin/bash \n\
+set -e \n\
+set -a \n\
+source ~/.startup \n\
+set +a \n\
+eval \"\$@\" \n\
+tail -f /dev/null" >> /usr/local/bin/dockerd-entrypoint.sh && \
+    chmod +x /usr/local/bin/dockerd-entrypoint.sh
+
+# Generate /var/run/sshd_config. Only the first echo creates the file with '>';
+# every following echo appends with '>>' so that all directives end up in the file.
+RUN echo 'HostKey /etc/ssh/ssh_host_dsa_key' > /var/run/sshd_config && \
+    echo 'HostKey /etc/ssh/ssh_host_rsa_key' >> /var/run/sshd_config && \
+    echo 'HostKey /etc/ssh/ssh_host_ecdsa_key' >> /var/run/sshd_config && \
+    echo 'HostKey /etc/ssh/ssh_host_ed25519_key' >> /var/run/sshd_config && \
+    echo 'AuthorizedKeysFile /etc/ssh/authorized_keys' >> /var/run/sshd_config && \
+    echo '## Enable DEBUG log. You can ignore this but this may help you debug any issue while enabling SSHD for the first time' >> /var/run/sshd_config && \
+    echo 'LogLevel DEBUG3' >> /var/run/sshd_config && \
+    echo 'UsePAM yes' >> /var/run/sshd_config && \
+    echo 'Subsystem sftp /usr/lib/openssh/sftp-server' >> /var/run/sshd_config
 
 RUN mkdir -p /licensing
 
@@ -109,6 +146,9 @@ RUN wget -q --no-check-certificate https://raw.githubusercontent.com/oneapi-src
     wget -q --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/docker/third-party-programs-pytorch.txt -O /licensing/third-party-programs-pytorch.txt && \
     wget -q --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licensing/LICENSE
 
+ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
+CMD ["bash"]
+
 FROM ${PYTHON_BASE} AS ipex-xpu-base
 
 RUN apt-get update && \
diff --git a/pytorch/README.md b/pytorch/README.md
index aaf70d67b..3e2d31774 100644
--- a/pytorch/README.md
+++ b/pytorch/README.md
@@ -97,7 +97,7 @@ docker run -it --rm \
     --net=host \
     -v $PWD/workspace:/workspace \
     -w /workspace \
-    intel/intel-extension-for-tensorflow:xpu-jupyter
+    intel/intel-extension-for-pytorch:xpu-jupyter
 ```
 
 After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.
@@ -113,6 +113,99 @@ The images below additionally include [Intel® oneAPI Collective Communications
 | `2.1.0-pip-mulitnode` | [v2.1.0] | [v2.1.0+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.3.1] | [v0.2.3] |
 | `2.0.0-pip-multinode` | [v2.0.0] | [v2.0.0+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.1.1] | [v0.1.0] |
 
+> **Note:** Passwordless SSH connection is also enabled in the image.
+> The container does not contain the SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/root/.ssh/id_rsa.pub`.
+> Users also need to append the contents of `id_rsa.pub` to `/etc/ssh/authorized_keys` in the SSH server container.
+> Since the SSH key is not owned by the default user account in Docker, please also run "chmod 644 id_rsa.pub; chmod 644 id_rsa" to grant read access to the default user account.
+> Users could also use "/usr/bin/ssh-keygen -t rsa -b 4096 -N '' -f ~/mnt/ssh_key/id_rsa" to generate a new SSH key inside the container.
+> Users need to mount a config file listing all hostnames at `/root/.ssh/config` in the SSH client container.
+> Once all files are in place, follow the steps below to start the worker and launcher containers.
+
+#### Setup and Run IPEX Multi-Node Container
+
+Some additional assembly is required to utilize this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively:
+
+SSH Server (Worker)
+
+1. *Authorized Keys* : `/etc/ssh/authorized_keys`
+
+SSH Client (Launcher)
+
+1. *Config File with Host IPs* : `/root/.ssh/config`
+2. *Private User Key* : `/root/.ssh/id_rsa`
+
+To add these files correctly please follow the steps described below.
+
+1. Setup ID Keys
+
+    You can use the commands provided below to [generate the Identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH.
+
+    ```bash
+    ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa
+    touch authorized_keys
+    cat id_rsa.pub >> authorized_keys
+    ```
+
+2. Add hosts to config
+
+    The launcher container needs to have a config file with all hostnames and ports specified. An example of a hostfile is provided below.
+
+    ```bash
+    touch config
+    ```
+
+    ```txt
+    Host host1
+        HostName <Hostname of host1>
+        IdentitiesOnly yes
+        Port <SSH Port>
+    Host host2
+        HostName <Hostname of host2>
+        IdentitiesOnly yes
+        Port <SSH Port>
+    ...
+    ```
+
+3. Configure the permissions and ownership for all of the files you have created so far.
+
+    ```bash
+    chmod 600 id_rsa.pub id_rsa config authorized_keys
+    chown root:root id_rsa.pub id_rsa config authorized_keys
+    ```
+
+4. Now start the workers and execute DDP on the launcher.
+
+    1. Worker run command:
+
+        ```bash
+        export SSH_PORT=<SSH Port>
+        docker run -it --rm \
+            --net=host \
+            -v $PWD/authorized_keys:/etc/ssh/authorized_keys \
+            -v $PWD/tests:/workspace/tests \
+            -w /workspace \
+            -e SSH_PORT=${SSH_PORT} \
+            intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
+            bash -c '/usr/sbin/sshd -D -p ${SSH_PORT} -f /var/run/sshd_config'
+        ```
+
+    2. Launcher run command:
+
+        ```bash
+        docker run -it --rm \
+            --net=host \
+            -v $PWD/id_rsa:/root/.ssh/id_rsa \
+            -v $PWD/config:/root/.ssh/config \
+            -v $PWD/tests:/workspace/tests \
+            -w /workspace \
+            -e SSH_PORT=${SSH_PORT} \
+            intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
+            bash -c 'ipexrun cpu /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl'
+        ```
+
+> [!NOTE]
+> [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.
+
 ---
 
 The images below are [TorchServe*] with CPU Optimizations:
diff --git a/pytorch/generate_ssh_keys.sh b/pytorch/generate_ssh_keys.sh
new file mode 100755
index 000000000..0ee61398e
--- /dev/null
+++ b/pytorch/generate_ssh_keys.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Generate the SSH host key for a single algorithm unless the key file already exists.
+# Arguments:
+#   $1 - key algorithm handed to `ssh-keygen -t` (dsa, rsa, ecdsa, ed25519)
+function gen_single_key() {
+    # `local` keeps the variable inside the function; the image entrypoint sources
+    # this script under `set -a`, which would otherwise export it to the user command.
+    local ALG_NAME=$1
+    if [[ ! -f "/etc/ssh/ssh_host_${ALG_NAME}_key" ]]; then
+        ssh-keygen -q -N "" -t "${ALG_NAME}" -f "/etc/ssh/ssh_host_${ALG_NAME}_key"
+    fi
+}
+
+gen_single_key dsa
+gen_single_key rsa
+gen_single_key ecdsa
+gen_single_key ed25519