Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 49 additions & 101 deletions tensorflow/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,11 @@ ENV KMP_AFFINITY='granularity=fine,verbose,compact,1,0' \
KMP_BLOCKTIME=1 \
KMP_SETTINGS=1

ARG TF_VERSION

WORKDIR /
COPY requirements.txt .

RUN python -m pip install --no-cache-dir -r requirements.txt
RUN python -m pip install --no-cache-dir -r requirements.txt && \
rm -rf requirements.txt

ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/
ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/
Expand All @@ -53,12 +52,13 @@ ENV KMP_AFFINITY='granularity=fine,verbose,compact,1,0' \
ENV PATH /usr/bin:/root/conda/envs/idp/bin:/root/conda/condabin:~/conda/bin/:${PATH}

ENV TF_ENABLE_ONEDNN_OPTS=1
ARG TF_VERSION

WORKDIR /
COPY requirements.txt .

RUN python -m pip install --no-cache-dir -r requirements.txt
RUN conda run -n idp python -m pip install --no-cache-dir -r requirements.txt && \
rm -rf requirements.txt && \
conda clean -y --all

ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/
ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/
Expand All @@ -77,37 +77,43 @@ EXPOSE 8888

CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/jupyter --port 8888 --ip 0.0.0.0 --no-browser --allow-root --ServerApp.token= --ServerApp.password= --ServerApp.allow_origin=* --ServerApp.base_url=$NB_PREFIX"]

FROM tf-base-${PACKAGE_OPTION} AS openmpi
FROM tf-base-${PACKAGE_OPTION} AS multinode

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
build-essential \
cmake \
g++ \
gcc \
git \
libgl1-mesa-glx \
libglib2.0-0 \
libopenmpi-dev \
numactl \
openmpi-bin \
openmpi-common
openmpi-common \
python3-dev \
unzip \
virtualenv

WORKDIR /
COPY ompi-requirements.txt .
ENV SIGOPT_PROJECT=.

RUN python -m pip install --no-cache-dir -r ompi-requirements.txt
WORKDIR /
COPY multinode/requirements.txt requirements.txt

FROM openmpi AS horovod
RUN python -m pip install --no-cache-dir -r requirements.txt && \
rm -rf requirements.txt

ENV LD_LIBRARY_PATH /lib64/:/usr/lib64/:/usr/local/lib64
ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"

RUN apt-get install -y --no-install-recommends --fix-missing \
unzip \
openssh-client \
openssh-server && \
rm /etc/ssh/ssh_host_*_key \
/etc/ssh/ssh_host_*_key.pub

ENV OMPI_ALLOW_RUN_AS_ROOT=1
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1

ENV OMPI_MCA_tl_tcp_if_exclude="lo,docker0"
rm /etc/ssh/ssh_host_*_key \
/etc/ssh/ssh_host_*_key.pub && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Install OpenSSH for MPI to communicate between containers
RUN mkdir -p /var/run/sshd && \
echo 'LoginGraceTime 0' >> /etc/ssh/sshd_config
RUN mkdir -p /var/run/sshd

# Install Horovod
ARG HOROVOD_WITH_TENSORFLOW=1
Expand All @@ -116,43 +122,32 @@ ARG HOROVOD_WITHOUT_PYTORCH=1
ARG HOROVOD_WITHOUT_GLOO=1
ARG HOROVOD_WITH_MPI=1

RUN apt-get install -y --no-install-recommends --fix-missing \
build-essential \
cmake \
g++ \
gcc \
git \
libgl1-mesa-glx \
libglib2.0-0 \
python3-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

WORKDIR /
COPY hvd-requirements.txt .

RUN python -m pip install --no-cache-dir -r hvd-requirements.txt

ENV SIGOPT_PROJECT=.

RUN wget --progress=dot:giga --no-check-certificate https://github.com/intel/neural-compressor/raw/master/docker/third-party-programs-tensorflow.txt -O /licenses/inc-third-party-programs-tensorflow.txt && \
wget --progress=dot:giga --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licenses/INC_LICENSE
ENV LD_LIBRARY_PATH /lib64/:/usr/lib64/:/usr/local/lib64

FROM horovod AS multinode-pip
RUN python -m pip install --no-cache-dir horovod==0.28.1

WORKDIR /
COPY multinode-requirements.txt .
ARG PYTHON_VERSION

RUN python -m pip install --no-cache-dir -r multinode-requirements.txt
COPY multinode/generate_ssh_keys.sh /generate_ssh_keys.sh

FROM horovod AS multinode-idp
# modify generate_ssh_keys to be a helper script
# print how to use helper script on bash startup
# Avoids loop for further execution of the startup file
ARG PACKAGE_OPTION=pip
ARG PYPATH="/usr/local/lib/python${PYTHON_VERSION}/dist-packages"
RUN if [ "${PACKAGE_OPTION}" = "idp" ]; then PYPATH="/opt/conda/envs/idp/lib/python${PYTHON_VERSION}/site-packages"; fi && \
echo "source ${PYPATH}/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \
cat '/generate_ssh_keys.sh' >> ~/.startup && \
rm -rf /generate_ssh_keys.sh

WORKDIR /
COPY multinode-requirements.txt .
COPY multinode/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
COPY multinode/sshd_config /etc/ssh/sshd_config
COPY multinode/ssh_config /etc/ssh/ssh_config

RUN python -m pip install --no-cache-dir -r multinode-requirements.txt
RUN wget --progress=dot:giga --no-check-certificate https://github.com/intel/neural-compressor/raw/master/docker/third-party-programs-tensorflow.txt -O /licenses/inc-third-party-programs-tensorflow.txt && \
wget --progress=dot:giga --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licenses/INC_LICENSE

FROM ${PYTHON_BASE} AS itex-xpu-base-pip
FROM ${PYTHON_BASE} AS itex-xpu-base

RUN apt-get update && \
apt-get install -y --no-install-recommends --fix-missing \
Expand Down Expand Up @@ -219,54 +214,7 @@ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/maste

ENV LD_LIBRARY_PATH=/opt/intel/oneapi/redist/lib:$LD_LIBRARY_PATH

# Stage itex-xpu-base-idp: XPU base image for the conda-based ("idp") package option.
# Installs OS build tooling, registers the Intel graphics apt repository, installs the
# OpenCL ICD / Level Zero packages at the versions given by build args, installs
# intel-extension-for-tensorflow into the `idp` conda environment, and fetches the
# third-party license texts into /licenses/.
FROM ${PYTHON_BASE} AS itex-xpu-base-idp

# OS-level tooling; gnupg2/gpg-agent and wget are needed below to import the repo key.
RUN apt-get update && \
    apt-get install -y --no-install-recommends --fix-missing \
    apt-utils \
    build-essential \
    clinfo \
    git \
    gnupg2 \
    gpg-agent \
    rsync \
    unzip \
    wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Package versions are supplied at build time (no defaults are declared here).
ARG ICD_VER
ARG LEVEL_ZERO_GPU_VER
ARG LEVEL_ZERO_VER
ARG LEVEL_ZERO_DEV_VER

# Register the Intel graphics apt repository.
# The key is downloaded to a temporary file instead of being piped into gpg: the default
# shell has no `pipefail`, so a failed download in `wget | gpg` would not fail the build.
# The proxy exclusion variables are cleared for the download only.
RUN no_proxy="" NO_PROXY="" wget -qO /tmp/intel-graphics.key https://repositories.intel.com/gpu/intel-graphics.key && \
    gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg /tmp/intel-graphics.key && \
    rm -f /tmp/intel-graphics.key && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
    > /etc/apt/sources.list.d/intel-gpu-jammy.list

# Install the pinned OpenCL ICD and Level Zero packages from the repository added above;
# apt metadata is removed in the same layer.
RUN no_proxy="" NO_PROXY="" apt-get update && \
    apt-get install -y --no-install-recommends --fix-missing \
    intel-opencl-icd=${ICD_VER} \
    intel-level-zero-gpu=${LEVEL_ZERO_GPU_VER} \
    level-zero=${LEVEL_ZERO_VER} \
    level-zero-dev=${LEVEL_ZERO_DEV_VER} && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

ARG ITEX_VER="2.15.0.1"

# Install the xpu build of ITEX into the `idp` environment and drop the conda package
# cache in the same layer so it does not persist in the image.
RUN conda install -n idp -y intel-extension-for-tensorflow=${ITEX_VER}=*xpu* \
    -c https://software.repos.intel.com/python/conda && \
    conda clean -y --all

ENV LD_LIBRARY_PATH=/opt/conda/envs/idp/lib:$LD_LIBRARY_PATH

# Third-party license texts, fetched from the upstream master branch at build time.
ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/
ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/
ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-programs-of-intel-tensorflow.txt /licenses/
ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-programs-of-intel-optimization-for-horovod.txt /licenses/

FROM itex-xpu-base-${PACKAGE_OPTION} AS itex-xpu-jupyter
FROM itex-xpu-base AS itex-xpu-jupyter

WORKDIR /jupyter
COPY jupyter-requirements.txt .
Expand Down
103 changes: 100 additions & 3 deletions tensorflow/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,16 @@ The images below are built only with CPU optimizations (GPU acceleration support

| Tag(s) | TensorFlow | ITEX | Dockerfile |
| --------------------------- | ----------- | ------------ | --------------- |
| `2.15.0-pip-base`, `latest` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
| `2.15.1-pip-base`, `latest` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
| `2.15.0-pip-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
| `2.14.0-pip-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
| `2.13-pip-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |

The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:

| Tag(s) | TensorFlow | ITEX | Dockerfile |
| -------------------- | ----------- | ------------- | --------------- |
| `2.15.1-pip-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
| `2.15.0-pip-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
| `2.14.0-pip-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
| `2.13-pip-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |
Expand All @@ -105,7 +107,7 @@ docker run -it --rm \
--net=host \
-v $PWD/workspace:/workspace \
-w /workspace \
intel/intel-extension-for-tensorflow:2.15.0-pip-jupyter
intel/intel-extension-for-tensorflow:2.15.1-pip-jupyter
```

After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.
Expand All @@ -116,10 +118,102 @@ The images below additionally include [Horovod]:

| Tag(s) | TensorFlow | ITEX | Horovod | Dockerfile |
| ------------------------------ | --------- | ------------ | --------- | --------------- |
| `2.15.1-pip-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] |
| `2.15.0-pip-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] |
| `2.14.0-pip-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] |
| `2.13-pip-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] |

> [!NOTE]
> Passwordless SSH connection is also enabled in the image, but the container does not contain any SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`.

> [!TIP]
> Before mounting any keys, modify the permissions of those files with `chmod 600 authorized_keys; chmod 600 id_rsa` so that only the owning user account can read and write them.

#### Setup and Run ITEX Multi-Node Container

Some additional assembly is required to utilize this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, each container is assigned one of two roles, launcher or worker, and each role needs a different file:

SSH Server (Worker)

1. *Authorized Keys* : `/etc/ssh/authorized_keys`

SSH Client (Launcher)

1. *Private User Key* : `/root/.ssh/id_rsa`

To add these files correctly please follow the steps described below.

1. Setup ID Keys

You can use the commands provided below to [generate the identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH.

```bash
ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa
touch authorized_keys
cat id_rsa.pub >> authorized_keys
```

2. Configure the permissions and ownership for all of the files you have created so far

```bash
chmod 600 id_rsa config authorized_keys
chown root:root id_rsa.pub id_rsa config authorized_keys
```

3. Create an SSH client configuration file, named `hostfile` here, that describes how to reach each host used by Horovod. (Optional)

```txt
Host host1
HostName <Hostname of host1>
IdentitiesOnly yes
IdentityFile ~/.ssh/id_rsa
Port <SSH Port>
Host host2
HostName <Hostname of host2>
IdentitiesOnly yes
IdentityFile ~/.ssh/id_rsa
Port <SSH Port>
...
```

4. Configure [Horovod] in your python script

```python
import horovod.tensorflow as hvd

hvd.init()
```

5. Now start the workers and execute DDP on the launcher

1. Worker run command:

```bash
docker run -it --rm \
--net=host \
-v $PWD/authorized_keys:/etc/ssh/authorized_keys \
-v $PWD/tests:/workspace/tests \
-w /workspace \
intel/intel-optimized-tensorflow:2.15.1-pip-multinode \
bash -c '/usr/sbin/sshd -D'
```

2. Launcher run command:

```bash
docker run -it --rm \
--net=host \
-v $PWD/id_rsa:/root/.ssh/id_rsa \
-v $PWD/tests:/workspace/tests \
-v $PWD/hostfile:/root/.ssh/config \
-w /workspace \
intel/intel-optimized-tensorflow:2.15.1-pip-multinode \
bash -c 'horovodrun --verbose -np 2 -H host1:1,host2:1 /workspace/tests/tf_base_test.py'
```

> [!NOTE]
> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.

---

The images below are [TensorFlow* Serving] with CPU Optimizations:
Expand Down Expand Up @@ -151,14 +245,16 @@ The images below are built only with CPU optimizations (GPU acceleration support

| Tag(s) | TensorFlow | ITEX | Dockerfile |
| --------------------------- | ----------- | ------------ | --------------- |
| `2.15.0-idp-base`, `latest` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
| `2.15.1-idp-base` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
| `2.15.0-idp-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
| `2.14.0-idp-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
| `2.13-idp-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |

The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:

| Tag(s) | TensorFlow | ITEX | Dockerfile |
| -------------------- | ----------- | ------------- | --------------- |
| `2.15.1-idp-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
| `2.15.0-idp-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
| `2.14.0-idp-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
| `2.13-idp-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |
Expand All @@ -167,6 +263,7 @@ The images below additionally include [Horovod]:

| Tag(s) | TensorFlow | ITEX | Horovod | Dockerfile |
| ------------------------------ | --------- | ------------ | --------- | --------------- |
| `2.15.1-idp-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] |
| `2.15.0-idp-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] |
| `2.14.0-idp-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] |
| `2.13-idp-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] |
Expand Down
Loading