diff --git a/tensorflow/Dockerfile b/tensorflow/Dockerfile index a54b5466b..48fb73321 100644 --- a/tensorflow/Dockerfile +++ b/tensorflow/Dockerfile @@ -33,12 +33,11 @@ ENV KMP_AFFINITY='granularity=fine,verbose,compact,1,0' \ KMP_BLOCKTIME=1 \ KMP_SETTINGS=1 -ARG TF_VERSION - WORKDIR / COPY requirements.txt . -RUN python -m pip install --no-cache-dir -r requirements.txt +RUN python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/ @@ -53,12 +52,13 @@ ENV KMP_AFFINITY='granularity=fine,verbose,compact,1,0' \ ENV PATH /usr/bin:/root/conda/envs/idp/bin:/root/conda/condabin:~/conda/bin/:${PATH} ENV TF_ENABLE_ONEDNN_OPTS=1 -ARG TF_VERSION WORKDIR / COPY requirements.txt . 
-RUN python -m pip install --no-cache-dir -r requirements.txt +RUN conda run -n idp python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt && \ + conda clean -y --all ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/ @@ -77,37 +77,43 @@ EXPOSE 8888 CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/jupyter --port 8888 --ip 0.0.0.0 --no-browser --allow-root --ServerApp.token= --ServerApp.password= --ServerApp.allow_origin=* --ServerApp.base_url=$NB_PREFIX"] -FROM tf-base-${PACKAGE_OPTION} AS openmpi +FROM tf-base-${PACKAGE_OPTION} AS multinode RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + cmake \ + g++ \ + gcc \ + git \ + libgl1-mesa-glx \ + libglib2.0-0 \ libopenmpi-dev \ + numactl \ openmpi-bin \ - openmpi-common + openmpi-common \ + python3-dev \ + unzip \ + virtualenv -WORKDIR / -COPY ompi-requirements.txt . +ENV SIGOPT_PROJECT=. 
-RUN python -m pip install --no-cache-dir -r ompi-requirements.txt +WORKDIR / +COPY multinode/requirements.txt requirements.txt -FROM openmpi AS horovod +RUN python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt -ENV LD_LIBRARY_PATH /lib64/:/usr/lib64/:/usr/local/lib64 +ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib" RUN apt-get install -y --no-install-recommends --fix-missing \ - unzip \ openssh-client \ openssh-server && \ - rm /etc/ssh/ssh_host_*_key \ - /etc/ssh/ssh_host_*_key.pub - -ENV OMPI_ALLOW_RUN_AS_ROOT=1 -ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 - -ENV OMPI_MCA_tl_tcp_if_exclude="lo,docker0" + rm /etc/ssh/ssh_host_*_key \ + /etc/ssh/ssh_host_*_key.pub && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* -# Install OpenSSH for MPI to communicate between containers -RUN mkdir -p /var/run/sshd && \ - echo 'LoginGraceTime 0' >> /etc/ssh/sshd_config +RUN mkdir -p /var/run/sshd # Install Horovod ARG HOROVOD_WITH_TENSORFLOW=1 @@ -116,43 +122,32 @@ ARG HOROVOD_WITHOUT_PYTORCH=1 ARG HOROVOD_WITHOUT_GLOO=1 ARG HOROVOD_WITH_MPI=1 -RUN apt-get install -y --no-install-recommends --fix-missing \ - build-essential \ - cmake \ - g++ \ - gcc \ - git \ - libgl1-mesa-glx \ - libglib2.0-0 \ - python3-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / -COPY hvd-requirements.txt . - -RUN python -m pip install --no-cache-dir -r hvd-requirements.txt - -ENV SIGOPT_PROJECT=. 
- -RUN wget --progress=dot:giga --no-check-certificate https://github.com/intel/neural-compressor/raw/master/docker/third-party-programs-tensorflow.txt -O /licenses/inc-third-party-programs-tensorflow.txt && \ - wget --progress=dot:giga --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licenses/INC_LICENSE +ENV LD_LIBRARY_PATH /lib64/:/usr/lib64/:/usr/local/lib64 -FROM horovod AS multinode-pip +RUN python -m pip install --no-cache-dir horovod==0.28.1 -WORKDIR / -COPY multinode-requirements.txt . +ARG PYTHON_VERSION -RUN python -m pip install --no-cache-dir -r multinode-requirements.txt +COPY multinode/generate_ssh_keys.sh /generate_ssh_keys.sh -FROM horovod AS multinode-idp +# modify generate_ssh_keys to be a helper script +# print how to use helper script on bash startup +# Avoids loop for further execution of the startup file +ARG PACKAGE_OPTION=pip +ARG PYPATH="/usr/local/lib/python${PYTHON_VERSION}/dist-packages" +RUN if [ "${PACKAGE_OPTION}" = "idp" ]; then PYPATH="/opt/conda/envs/idp/lib/python${PYTHON_VERSION}/site-packages"; fi && \ + echo "source ${PYPATH}/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \ + cat '/generate_ssh_keys.sh' >> ~/.startup && \ + rm -rf /generate_ssh_keys.sh -WORKDIR / -COPY multinode-requirements.txt . 
+COPY multinode/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh +COPY multinode/sshd_config /etc/ssh/sshd_config +COPY multinode/ssh_config /etc/ssh/ssh_config -RUN python -m pip install --no-cache-dir -r multinode-requirements.txt +RUN wget --progress=dot:giga --no-check-certificate https://github.com/intel/neural-compressor/raw/master/docker/third-party-programs-tensorflow.txt -O /licenses/inc-third-party-programs-tensorflow.txt && \ + wget --progress=dot:giga --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licenses/INC_LICENSE -FROM ${PYTHON_BASE} AS itex-xpu-base-pip +FROM ${PYTHON_BASE} AS itex-xpu-base RUN apt-get update && \ apt-get install -y --no-install-recommends --fix-missing \ @@ -219,54 +214,7 @@ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/maste ENV LD_LIBRARY_PATH=/opt/intel/oneapi/redist/lib:$LD_LIBRARY_PATH -FROM ${PYTHON_BASE} AS itex-xpu-base-idp - -RUN apt-get update && \ - apt-get install -y --no-install-recommends --fix-missing \ - apt-utils \ - build-essential \ - clinfo \ - git \ - gnupg2 \ - gpg-agent \ - rsync \ - unzip \ - wget && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ARG ICD_VER -ARG LEVEL_ZERO_GPU_VER -ARG LEVEL_ZERO_VER -ARG LEVEL_ZERO_DEV_VER - -RUN no_proxy="" NO_PROXY="" wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ - gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg -RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" | \ - tee /etc/apt/sources.list.d/intel-gpu-jammy.list - -RUN no_proxy="" NO_PROXY="" apt-get update && \ - apt-get install -y --no-install-recommends --fix-missing \ - intel-opencl-icd=${ICD_VER} \ - intel-level-zero-gpu=${LEVEL_ZERO_GPU_VER} \ - level-zero=${LEVEL_ZERO_VER} \ - level-zero-dev=${LEVEL_ZERO_DEV_VER} && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ARG 
ITEX_VER="2.15.0.1" - -RUN conda install -n idp -y intel-extension-for-tensorflow=${ITEX_VER}=*xpu* \ - -c https://software.repos.intel.com/python/conda - -ENV LD_LIBRARY_PATH=/opt/conda/envs/idp/lib:$LD_LIBRARY_PATH - -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/ -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/ -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-programs-of-intel-tensorflow.txt /licenses/ -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-programs-of-intel-optimization-for-horovod.txt /licenses/ - -FROM itex-xpu-base-${PACKAGE_OPTION} AS itex-xpu-jupyter +FROM itex-xpu-base AS itex-xpu-jupyter WORKDIR /jupyter COPY jupyter-requirements.txt . 
diff --git a/tensorflow/README.md b/tensorflow/README.md index 195cebdf3..ac2c8b7c9 100644 --- a/tensorflow/README.md +++ b/tensorflow/README.md @@ -85,7 +85,8 @@ The images below are built only with CPU optimizations (GPU acceleration support | Tag(s) | TensorFlow | ITEX | Dockerfile | | --------------------------- | ----------- | ------------ | --------------- | -| `2.15.0-pip-base`, `latest` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | +| `2.15.1-pip-base`, `latest` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | +| `2.15.0-pip-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-pip-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-pip-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -93,6 +94,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | TensorFlow | ITEX | Dockerfile | | -------------------- | ----------- | ------------- | --------------- | +| `2.15.1-pip-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | | `2.15.0-pip-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-pip-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-pip-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -105,7 +107,7 @@ docker run -it --rm \ --net=host \ -v $PWD/workspace:/workspace \ -w /workspace \ - intel/intel-extension-for-tensorflow:2.15.0-pip-jupyter + intel/intel-extension-for-tensorflow:2.15.1-pip-jupyter ``` After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. 
@@ -116,10 +118,102 @@ The images below additionally include [Horovod]: | Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile | | ------------------------------ | --------- | ------------ | --------- | --------------- | +| `2.15.1-pip-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] | | `2.15.0-pip-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] | | `2.14.0-pip-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] | | `2.13-pip-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] | +> [!NOTE] +> Passwordless SSH connection is also enabled in the image, but the container does not contain any SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`. + +> [!TIP] +> Before mounting any keys, modify the permissions of those files with `chmod 600 authorized_keys; chmod 600 id_rsa` to grant read access for the default user account. + +#### Setup and Run ITEX Multi-Node Container + +Some additional assembly is required to utilize this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively: + +SSH Server (Worker) + +1. *Authorized Keys* : `/etc/ssh/authorized_keys` + +SSH Client (Launcher) + +1. *Private User Key* : `/root/.ssh/id_rsa` + +To add these files correctly please follow the steps described below. + +1. Setup ID Keys + + You can use the commands provided below to [generate the identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH. + + ```bash + ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa + touch authorized_keys + cat id_rsa.pub >> authorized_keys + ``` + +2. Configure the permissions and ownership for all of the files you have created so far + + ```bash + chmod 600 id_rsa config authorized_keys + chown root:root id_rsa.pub id_rsa config authorized_keys + ``` + +3. Create a hostfile for horovod. 
(Optional) + + ```txt + Host host1 + HostName <hostname-or-ip-of-host1> + IdentitiesOnly yes + IdentityFile ~/.ssh/id_rsa + Port <ssh-port> + Host host2 + HostName <hostname-or-ip-of-host2> + IdentitiesOnly yes + IdentityFile ~/.ssh/id_rsa + Port <ssh-port> + ... + ``` + +4. Configure [Horovod] in your Python script + + ```python + import horovod.tensorflow as hvd + + hvd.init() + ``` + +5. Now start the workers and execute DDP on the launcher + + 1. Worker run command: + + ```bash + docker run -it --rm \ + --net=host \ + -v $PWD/authorized_keys:/etc/ssh/authorized_keys \ + -v $PWD/tests:/workspace/tests \ + -w /workspace \ + intel/intel-optimized-tensorflow:2.15.1-pip-multinode \ + bash -c '/usr/sbin/sshd -D' + ``` + + 2. Launcher run command: + + ```bash + docker run -it --rm \ + --net=host \ + -v $PWD/id_rsa:/root/.ssh/id_rsa \ + -v $PWD/tests:/workspace/tests \ + -v $PWD/hostfile:/root/.ssh/config \ + -w /workspace \ + intel/intel-optimized-tensorflow:2.15.1-pip-multinode \ + bash -c 'horovodrun --verbose -np 2 -H host1:1,host2:1 python /workspace/tests/tf_base_test.py' + ``` + +> [!NOTE] +> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network. 
+ --- The images below are [TensorFlow* Serving] with CPU Optimizations: @@ -151,7 +245,8 @@ The images below are built only with CPU optimizations (GPU acceleration support | Tag(s) | TensorFlow | ITEX | Dockerfile | | --------------------------- | ----------- | ------------ | --------------- | -| `2.15.0-idp-base`, `latest` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | +| `2.15.1-idp-base` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | +| `2.15.0-idp-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-idp-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-idp-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -159,6 +254,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | TensorFlow | ITEX | Dockerfile | | -------------------- | ----------- | ------------- | --------------- | +| `2.15.1-idp-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | | `2.15.0-idp-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-idp-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-idp-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -167,6 +263,7 @@ The images below additionally include [Horovod]: | Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile | | ------------------------------ | --------- | ------------ | --------- | --------------- | +| `2.15.1-idp-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] | | `2.15.0-idp-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] | | `2.14.0-idp-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] | | `2.13-idp-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] | diff --git a/tensorflow/docker-compose.yaml b/tensorflow/docker-compose.yaml index 9583b2966..18aec65ad 100644 --- a/tensorflow/docker-compose.yaml +++ b/tensorflow/docker-compose.yaml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-version: '3' include: - path: - ../python/docker-compose.yaml @@ -31,7 +30,7 @@ services: PYTHON_VERSION: ${PYTHON_VERSION:-3.10} REGISTRY: ${REGISTRY} REPO: ${REPO} - TF_VERSION: ${TF_VERSION:-2.15.0} + TF_VERSION: ${TF_VERSION:-2.15.1} target: tf-base-${PACKAGE_OPTION:-pip} context: . labels: @@ -41,20 +40,20 @@ services: org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.name: "intel/intel-optimized-tensorflow" org.opencontainers.image.title: "Intel® Extension for TensorFlow Base Image" - org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base + org.opencontainers.image.version: ${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-base depends_on: - ${PACKAGE_OPTION:-pip} command: > python -c 'import tensorflow as tf; print("Tensorflow Version:", tf.__version__)' - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-base + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-base pull_policy: always jupyter: build: labels: dependency.python.pip: jupyter-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow Jupyter Image" - org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-jupyter + org.opencontainers.image.version: ${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-jupyter target: jupyter command: > bash -c "python -m jupyter --version" @@ -62,32 +61,38 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} extends: tf-base - image: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-jupyter + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-jupyter network_mode: host volumes: - /$PWD:/jupyter multinode: build: labels: + dependency.apt.build-essential: true + dependency.apt.cmake: true dependency.apt.gcc: true + dependency.apt.g++: true + dependency.apt.git: true dependency.apt.libgl1-mesa-glx: true dependency.apt.libglib2: true - dependency.apt.python3-dev: true - dependency.pip.apt.virtualenv: true dependency.apt.libopenmpi-dev: true + dependency.apt.numactl: true dependency.apt.openmpi-bin: true - dependency.apt.unzip: true dependency.apt.openssh-client: true dependency.apt.openssh-server: true - dependency.python.pip: multinode-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base" + dependency.apt.python3-dev: true + dependency.apt.unzip: true + dependency.pip.apt.virtualenv: true + dependency.pip.horovod: 0.28.1 + dependency.python.pip: multinode/requirements.txt + org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow MultiNode Image" - org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-multinode - target: multinode-${PACKAGE_OPTION:-pip} + org.opencontainers.image.version: ${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-multinode + target: multinode command: > bash -c "horovodrun --check-build && mpirun --version && python -c 'import horovod.tensorflow as hvd;hvd.init();import horovod.tensorflow;import neural_compressor, tf2onnx; print(\"\\nNeural Compressor Version:\", neural_compressor.__version__, \"\\\nTensorFlow2ONNX 
Version:\", tf2onnx.__version__)'" extends: tf-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} xpu: build: args: @@ -120,7 +125,7 @@ services: org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.title: "Intel® Extension for TensorFlow XPU Base Image" org.opencontainers.image.version: ${TF_VER:-2.15.0}-xpu-${PACKAGE_OPTION:-pip}-base - target: itex-xpu-base-${PACKAGE_OPTION:-pip} + target: itex-xpu-base command: > sh -c "python -c 'import tensorflow as tf;print(tf.__version__);from tensorflow.python.client import device_lib;print(device_lib.list_local_devices())'" extends: tf-base @@ -140,7 +145,7 @@ services: NO_PROXY: '' labels: dependency.python.pip: jupyter-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-xpu-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.1}-xpu-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow XPU Jupyter Image" org.opencontainers.image.version: ${TF_VER:-2.15.0}-xpu-${PACKAGE_OPTION:-pip}-jupyter target: itex-xpu-jupyter diff --git a/tensorflow/hvd-requirements.txt b/tensorflow/hvd-requirements.txt deleted file mode 100644 index f2eadccea..000000000 --- a/tensorflow/hvd-requirements.txt +++ /dev/null @@ -1 +0,0 @@ -horovod==0.28.1 diff --git a/tensorflow/jupyter-requirements.txt b/tensorflow/jupyter-requirements.txt index 23a738859..9bdbed92a 100644 --- a/tensorflow/jupyter-requirements.txt +++ 
b/tensorflow/jupyter-requirements.txt @@ -1,4 +1,4 @@ -jupyterlab==4.3.0a0 +jupyterlab>=4.2.4 jupyterhub==5.1.0 -notebook==7.3.0a0 +notebook>=7.1.3 jupyter-server-proxy>=4.1.2 diff --git a/tensorflow/multinode-requirements.txt b/tensorflow/multinode-requirements.txt deleted file mode 100644 index d9cff3697..000000000 --- a/tensorflow/multinode-requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -cython -tf2onnx -neural-compressor==2.6 diff --git a/tensorflow/multinode/dockerd-entrypoint.sh b/tensorflow/multinode/dockerd-entrypoint.sh new file mode 100755 index 000000000..ba13c0f94 --- /dev/null +++ b/tensorflow/multinode/dockerd-entrypoint.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +set -a +# shellcheck disable=SC1091 +source "$HOME/.startup" +set +a +"$@" diff --git a/tensorflow/multinode/generate_ssh_keys.sh b/tensorflow/multinode/generate_ssh_keys.sh new file mode 100755 index 000000000..0ee61398e --- /dev/null +++ b/tensorflow/multinode/generate_ssh_keys.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +function gen_single_key() { + ALG_NAME=$1 + if [[ ! -f /etc/ssh/ssh_host_${ALG_NAME}_key ]]; then + ssh-keygen -q -N "" -t "${ALG_NAME}" -f "/etc/ssh/ssh_host_${ALG_NAME}_key" + fi +} + +gen_single_key dsa +gen_single_key rsa +gen_single_key ecdsa +gen_single_key ed25519 diff --git a/tensorflow/multinode/requirements.txt b/tensorflow/multinode/requirements.txt new file mode 100644 index 000000000..807477402 --- /dev/null +++ b/tensorflow/multinode/requirements.txt @@ -0,0 +1,5 @@ +cython>=3.0.11 +impi-rt>=2021.12.0 +mpi4py>=3.1.0 +neural-compressor==3.0 +tf2onnx>=1.16.1 diff --git a/tensorflow/multinode/ssh_config b/tensorflow/multinode/ssh_config new file mode 100644 index 000000000..9ac730173 --- /dev/null +++ b/tensorflow/multinode/ssh_config @@ -0,0 +1,4 @@ +Host * + Port 3022 + IdentityFile ~/.ssh/id_rsa + StrictHostKeyChecking no diff --git a/tensorflow/multinode/sshd_config b/tensorflow/multinode/sshd_config new file mode 100644 index 000000000..4796a48af --- /dev/null +++ b/tensorflow/multinode/sshd_config @@ -0,0 +1,12 @@ +HostKey /etc/ssh/ssh_host_dsa_key +HostKey /etc/ssh/ssh_host_rsa_key +HostKey /etc/ssh/ssh_host_ecdsa_key +HostKey /etc/ssh/ssh_host_ed25519_key +AuthorizedKeysFile /etc/ssh/authorized_keys +## Enable DEBUG log. 
You can ignore this but this may help you debug any issue while enabling SSHD for the first time +LogLevel DEBUG3 +Port 3022 +UsePAM yes +Subsystem sftp /usr/lib/openssh/sftp-server +# https://ubuntu.com/security/CVE-2024-6387 +LoginGraceTime 0 diff --git a/tensorflow/ompi-requirements.txt b/tensorflow/ompi-requirements.txt deleted file mode 100644 index 7b64c1667..000000000 --- a/tensorflow/ompi-requirements.txt +++ /dev/null @@ -1 +0,0 @@ -impi-rt>=2021.12.0 diff --git a/tensorflow/requirements.txt b/tensorflow/requirements.txt index 9b50ec785..92fd1059d 100644 --- a/tensorflow/requirements.txt +++ b/tensorflow/requirements.txt @@ -1,4 +1,4 @@ -tensorflow==2.15.0 -intel-extension-for-tensorflow[cpu]==2.15.0.0 +tensorflow==2.15.1 +intel-extension-for-tensorflow[cpu]>=2.15,<2.16 tensorflow-hub==0.16.1 -pillow==10.3.0 +pillow==10.4.0 diff --git a/tensorflow/serving/requirements.txt b/tensorflow/serving/requirements.txt index cf28053cb..cd80fbcd9 100644 --- a/tensorflow/serving/requirements.txt +++ b/tensorflow/serving/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.0.0 -pillow==10.3.0 +numpy==2.0.1 +pillow==10.4.0 requests==2.32.3 -tensorflow==2.16.1 -tensorflow-serving-api==2.16.1 +tensorflow==2.17.0 +tensorflow-serving-api==2.17.0 diff --git a/tensorflow/tests/tests.yaml b/tensorflow/tests/tests.yaml index 0fa5b2b3f..43af22394 100644 --- a/tensorflow/tests/tests.yaml +++ b/tensorflow/tests/tests.yaml @@ -14,7 +14,7 @@ --- import-itex-cpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-base cmd: python -c "from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())" import-itex-xpu-${PACKAGE_OPTION:-pip}: img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-base @@ -24,20 +24,20 @@ import-itex-xpu-${PACKAGE_OPTION:-pip}: - src: ${PWD}/tensorflow/tests dst: /tests import-cpu-jupyter-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-jupyter + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-jupyter cmd: python -m jupyter --version import-xpu-jupyter-${PACKAGE_OPTION:-pip}: img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-jupyter cmd: python -m jupyter --version device: ["/dev/dri"] import-multinode-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: horovodrun --check-build && mpirun --version && python -c 'import horovod.tensorflow as hvd;hvd.init();import horovod.tensorflow' import-inc-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: python -c "import neural_compressor as inc;print(inc.__version__)" itex-cpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-base cmd: python /tests/tf_base_test.py volumes: - src: ${PWD}/tensorflow/tests @@ -55,13 +55,13 @@ itex-xpu-jupyter-${PACKAGE_OPTION:-pip}: notebook: True device: ["/dev/dri"] multinode-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: horovodrun -np 2 -H localhost:2 --binding-args="-bind-to socket -map-by socket" python /tests/tf_base_test.py volumes: - dst: /tests src: $PWD/tensorflow/tests inc-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: bash /tests/inc_test.sh volumes: - dst: /tests diff --git a/tensorflow/xpu-requirements.txt b/tensorflow/xpu-requirements.txt index 0280ef9d5..9e4bb523d 100644 --- a/tensorflow/xpu-requirements.txt +++ b/tensorflow/xpu-requirements.txt @@ -1,2 +1,2 @@ -tensorflow==2.15.1 +tensorflow==2.15.0 intel-extension-for-tensorflow[xpu]==2.15.0.1