From 8ea038a355412595dd487b150a989ef71a25bf0e Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Fri, 28 Jun 2024 12:48:12 -0700 Subject: [PATCH 01/12] init deepspeewd Signed-off-by: tylertitsworth --- pytorch/docker-compose.yaml | 7 ++----- pytorch/multinode-requirements.txt | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pytorch/docker-compose.yaml b/pytorch/docker-compose.yaml index f18afdf8d..33137d448 100644 --- a/pytorch/docker-compose.yaml +++ b/pytorch/docker-compose.yaml @@ -82,13 +82,10 @@ services: org.opencontainers.image.title: "Intel® Extension for PyTorch MultiNode Image" org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode target: multinode - command: > - sh -c "python -c 'import neural_compressor;import - oneccl_bindings_for_pytorch as oneccl; print(\"Neural Compressor - Version:\", neural_compressor.__version__, \"\\nOneCCL:\", - oneccl.__version__)'" + command: ds_report extends: ipex-base image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + shm_size: 2gb xpu: build: args: diff --git a/pytorch/multinode-requirements.txt b/pytorch/multinode-requirements.txt index 9b93026e9..33cb8fb72 100644 --- a/pytorch/multinode-requirements.txt +++ b/pytorch/multinode-requirements.txt @@ -1,3 +1,4 @@ +deepspeed==0.14.4 oneccl_bind_pt==2.3.0+cpu -f https://developer.intel.com/ipex-whl-stable-cpu neural-compressor==2.6 From 5fcb25cda84510590b99148b1b376c046115cdb0 Mon Sep 17 00:00:00 2001 From: Sharvil Shah Date: Fri, 28 Jun 2024 13:33:54 -0700 Subject: [PATCH 02/12] Fixed multinode entrypoint to exit properly (#186) Signed-off-by: tylertitsworth --- pytorch/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch/Dockerfile b/pytorch/Dockerfile index c02ca9b26..395916ca2 100644 --- a/pytorch/Dockerfile +++ 
b/pytorch/Dockerfile @@ -123,8 +123,7 @@ set -e \n\ set -a \n\ source ~/.startup \n\ set +a \n\ -eval \"\$@\" \n\ -tail -f /dev/null" >> /usr/local/bin/dockerd-entrypoint.sh && \ +eval \"\$@\"" >> /usr/local/bin/dockerd-entrypoint.sh && \ chmod +x /usr/local/bin/dockerd-entrypoint.sh RUN echo 'HostKey /etc/ssh/ssh_host_dsa_key' > /var/run/sshd_config && \ From 815eaee93dcedd23b1dd4abe6d4ecc2bd9395a5f Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Fri, 28 Jun 2024 16:00:40 -0700 Subject: [PATCH 03/12] fix smoke test Signed-off-by: tylertitsworth --- pytorch/Dockerfile | 10 ++++++---- pytorch/docker-compose.yaml | 7 ++++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pytorch/Dockerfile b/pytorch/Dockerfile index 395916ca2..f3defdb34 100644 --- a/pytorch/Dockerfile +++ b/pytorch/Dockerfile @@ -44,7 +44,8 @@ ARG TORCHVISION_VERSION WORKDIR / COPY requirements.txt . -RUN python -m pip install --no-cache-dir -r requirements.txt +RUN python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt FROM ${PYTHON_BASE} AS ipex-base-idp @@ -64,7 +65,8 @@ FROM ipex-base-${PACKAGE_OPTION} AS jupyter WORKDIR /jupyter COPY jupyter-requirements.txt . -RUN python -m pip install --no-cache-dir -r jupyter-requirements.txt +RUN python -m pip install --no-cache-dir -r jupyter-requirements.txt && \ + rm -rf jupyter-requirements.txt RUN mkdir -p /jupyter/ && chmod -R a+rwx /jupyter/ RUN mkdir /.local && chmod a+rwx /.local @@ -87,7 +89,8 @@ ENV SIGOPT_PROJECT=. WORKDIR / COPY multinode-requirements.txt . 
-RUN python -m pip install --no-cache-dir -r multinode-requirements.txt +RUN python -m pip install --no-cache-dir -r multinode-requirements.txt && \ + rm -rf multinode-requirements.txt ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib" @@ -143,7 +146,6 @@ RUN wget -q --no-check-certificate https://raw.githubusercontent.com/oneapi-src wget -q --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licensing/LICENSE ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"] -CMD ["bash"] FROM ${PYTHON_BASE} AS ipex-xpu-base diff --git a/pytorch/docker-compose.yaml b/pytorch/docker-compose.yaml index 33137d448..bfc3ba680 100644 --- a/pytorch/docker-compose.yaml +++ b/pytorch/docker-compose.yaml @@ -82,7 +82,12 @@ services: org.opencontainers.image.title: "Intel® Extension for PyTorch MultiNode Image" org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode target: multinode - command: ds_report + command: > + "ds_report && + python -c 'import neural_compressor;import oneccl_bindings_for_pytorch as oneccl;import deepspeed; + print(\"Neural Compressor:\", neural_compressor.__version__, + \"\\nOneCCL:\", oneccl.__version__, + \"\\nDeepspeed:\", deepspeed.__version__)'" extends: ipex-base image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} shm_size: 2gb From fbad7ba405449cbcff81f0ffa46204608e70504d Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Mon, 1 Jul 2024 10:16:18 -0700 Subject: [PATCH 04/12] add deepspeed test Signed-off-by: tylertitsworth --- pytorch/Dockerfile | 6 +++++- pytorch/docker-compose.yaml | 3 +-- 
pytorch/multinode-requirements.txt | 3 ++- pytorch/tests/ipex-resnet50.py | 18 ++++++++++++------ pytorch/tests/tests.yaml | 16 ++++++++++++++-- 5 files changed, 34 insertions(+), 12 deletions(-) diff --git a/pytorch/Dockerfile b/pytorch/Dockerfile index f3defdb34..392d86d6b 100644 --- a/pytorch/Dockerfile +++ b/pytorch/Dockerfile @@ -80,8 +80,11 @@ FROM ipex-base-${PACKAGE_OPTION} AS multinode RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ python3-dev \ gcc \ + g++ \ libgl1-mesa-glx \ libglib2.0-0 \ + libopenmpi-dev \ + numactl \ virtualenv ENV SIGOPT_PROJECT=. @@ -89,7 +92,8 @@ ENV SIGOPT_PROJECT=. WORKDIR / COPY multinode-requirements.txt . -RUN python -m pip install --no-cache-dir -r multinode-requirements.txt && \ +RUN DS_BUILD_OPS=1 python -m pip install --no-cache-dir -r multinode-requirements.txt && \ + echo "Y" | pip uninstall nvidia-ml-py && \ rm -rf multinode-requirements.txt ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib" diff --git a/pytorch/docker-compose.yaml b/pytorch/docker-compose.yaml index bfc3ba680..27dc559ca 100644 --- a/pytorch/docker-compose.yaml +++ b/pytorch/docker-compose.yaml @@ -83,8 +83,7 @@ services: org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode target: multinode command: > - "ds_report && - python -c 'import neural_compressor;import oneccl_bindings_for_pytorch as oneccl;import deepspeed; + python -c "'import neural_compressor;import oneccl_bindings_for_pytorch as oneccl;import deepspeed; print(\"Neural Compressor:\", neural_compressor.__version__, \"\\nOneCCL:\", oneccl.__version__, \"\\nDeepspeed:\", deepspeed.__version__)'" diff --git a/pytorch/multinode-requirements.txt b/pytorch/multinode-requirements.txt index 33cb8fb72..df74e9026 100644 --- 
a/pytorch/multinode-requirements.txt +++ b/pytorch/multinode-requirements.txt @@ -1,4 +1,5 @@ deepspeed==0.14.4 +mpi4py>=3.1.0 +neural-compressor==2.6 oneccl_bind_pt==2.3.0+cpu -f https://developer.intel.com/ipex-whl-stable-cpu -neural-compressor==2.6 diff --git a/pytorch/tests/ipex-resnet50.py b/pytorch/tests/ipex-resnet50.py index 5b8157230..6861c388f 100644 --- a/pytorch/tests/ipex-resnet50.py +++ b/pytorch/tests/ipex-resnet50.py @@ -30,6 +30,7 @@ parser.add_argument("--device", default="cpu", choices=["cpu", "xpu"]) parser.add_argument("--ipex", action="store_true") parser.add_argument("--backend", default="gloo", choices=["gloo", "ccl"]) +parser.add_argument("--deepspeed", action="store_true") args = parser.parse_args() try: @@ -37,12 +38,17 @@ except: pass -dist.init_process_group( - backend=args.backend, - init_method=init_method, - world_size=int(os.environ.get("WORLD_SIZE")), - rank=int(os.environ.get("RANK")), -) +if args.deepspeed: + import deepspeed + + deepspeed.init_distributed(dist_backend="mpi") +else: + dist.init_process_group( + backend=args.backend, + init_method=init_method, + world_size=int(os.environ.get("WORLD_SIZE")), + rank=int(os.environ.get("RANK")), + ) model = models.resnet50(pretrained=False) diff --git a/pytorch/tests/tests.yaml b/pytorch/tests/tests.yaml index 7ebbed262..b950763ac 100644 --- a/pytorch/tests/tests.yaml +++ b/pytorch/tests/tests.yaml @@ -26,10 +26,14 @@ import-xpu-jupyter-${PACKAGE_OPTION:-pip}: cmd: python -m jupyter --version import-cpu-oneccl-${PACKAGE_OPTION:-pip}: img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} - cmd: python -c "import oneccl_bindings_for_pytorch as oneccl; print(f'oneccl {oneccl.__version__}')" + cmd: python -c "'import oneccl_bindings_for_pytorch as oneccl;print(oneccl.__version__)'" import-cpu-inc-${PACKAGE_OPTION:-pip}: img: 
${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} - cmd: python -c "import neural_compressor as inc;print(inc.__version__)" + cmd: python -c "'import neural_compressor as inc;print(inc.__version__)'" +import-cpu-deepspeed-${PACKAGE_OPTION:-pip}: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + cmd: ds_report + shm_size: 2gb ipex-cpu-${PACKAGE_OPTION:-pip}: img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-base cmd: python /tests/ipex-resnet50.py --ipex --device cpu --backend gloo @@ -49,6 +53,14 @@ ipex-xpu-jupyter-${PACKAGE_OPTION:-pip}: oneccl-${PACKAGE_OPTION:-pip}: img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} cmd: ipexrun cpu /tests/ipex-resnet50.py --ipex --device cpu --backend ccl + privileged: true + volumes: + - dst: /tests + src: $PWD/pytorch/tests +oneccl-ds-${PACKAGE_OPTION:-pip}: + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6} + cmd: ipexrun cpu /tests/ipex-resnet50.py --ipex --device cpu --backend ccl --deepspeed + privileged: true volumes: - dst: /tests src: $PWD/pytorch/tests From caac4c42ab4c1dc2231890912a697d15cfa42c37 Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Wed, 3 Jul 2024 14:15:28 -0700 Subject: [PATCH 05/12] build deepspeed ops Signed-off-by: tylertitsworth --- 
pytorch/Dockerfile | 3 ++- pytorch/multinode-requirements.txt | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch/Dockerfile b/pytorch/Dockerfile index 392d86d6b..7f09cc174 100644 --- a/pytorch/Dockerfile +++ b/pytorch/Dockerfile @@ -92,7 +92,8 @@ ENV SIGOPT_PROJECT=. WORKDIR / COPY multinode-requirements.txt . -RUN DS_BUILD_OPS=1 python -m pip install --no-cache-dir -r multinode-requirements.txt && \ +RUN python -m pip install --no-cache-dir -r multinode-requirements.txt && \ + DS_BUILD_OPS=1 python -m pip install --no-cache-dir deepspeed==0.14.4 && \ echo "Y" | pip uninstall nvidia-ml-py && \ rm -rf multinode-requirements.txt diff --git a/pytorch/multinode-requirements.txt b/pytorch/multinode-requirements.txt index df74e9026..931830c31 100644 --- a/pytorch/multinode-requirements.txt +++ b/pytorch/multinode-requirements.txt @@ -1,5 +1,6 @@ -deepspeed==0.14.4 -mpi4py>=3.1.0 neural-compressor==2.6 oneccl_bind_pt==2.3.0+cpu -f https://developer.intel.com/ipex-whl-stable-cpu +# required to build deepspeed ops +oneccl-devel>=2021.13.0 +mpi4py>=3.1.0 From 700ac344744b707301f5081cb215d0af680dd20f Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Wed, 3 Jul 2024 14:27:05 -0700 Subject: [PATCH 06/12] use ccl backend Signed-off-by: tylertitsworth --- pytorch/tests/ipex-resnet50.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch/tests/ipex-resnet50.py b/pytorch/tests/ipex-resnet50.py index 6861c388f..cfa03cf3a 100644 --- a/pytorch/tests/ipex-resnet50.py +++ b/pytorch/tests/ipex-resnet50.py @@ -41,7 +41,9 @@ if args.deepspeed: import deepspeed - deepspeed.init_distributed(dist_backend="mpi") + deepspeed.init_distributed( + deepspeed.accelerator.get_accelerator().communication_backend_name() + ) else: dist.init_process_group( backend=args.backend, From 28837ce19f30aaa6b01efc5c51b254527e6cc5fe Mon Sep 17 00:00:00 2001 From: Tyler Titsworth Date: Wed, 3 Jul 2024 14:31:39 -0700 Subject: [PATCH 07/12] Update 
docker-compose.yaml Signed-off-by: Tyler Titsworth --- pytorch/docker-compose.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch/docker-compose.yaml b/pytorch/docker-compose.yaml index 27dc559ca..e2dd4eec4 100644 --- a/pytorch/docker-compose.yaml +++ b/pytorch/docker-compose.yaml @@ -78,6 +78,7 @@ services: dependency.apt.python3-dev: true dependency.pip.apt.virtualenv: true dependency.python.pip: multinode-requirements.txt + dependency.python.deepspeed: 0.14.4 org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for PyTorch MultiNode Image" org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode From 715600bebe4089a383252646316fee8204fc3cd1 Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Wed, 3 Jul 2024 14:45:23 -0700 Subject: [PATCH 08/12] fix docs Signed-off-by: tylertitsworth --- docs/scripts/matrix.py | 2 +- pytorch/docker-compose.yaml | 2 +- pytorch/multinode-requirements.txt | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/scripts/matrix.py b/docs/scripts/matrix.py index f03b49b93..c27c88560 100644 --- a/docs/scripts/matrix.py +++ b/docs/scripts/matrix.py @@ -76,7 +76,7 @@ def get_dependency_string(dep_type): py_reqs = re.sub(r"\n-(.*)", "", f.read()) py_reqs = re.sub(r"(.*]?)(\W=)(.*)", r"\1 \3", py_reqs) py_reqs = re.sub(r"#(.*)", "", py_reqs) - py_deps = py_deps + "\n".join(py_reqs.split("\n")) + py_deps = py_deps + "\n" + "\n".join(py_reqs.split("\n")) return os_deps, py_deps, conda_deps diff --git a/pytorch/docker-compose.yaml b/pytorch/docker-compose.yaml index e2dd4eec4..a519f2a73 100644 --- a/pytorch/docker-compose.yaml +++ b/pytorch/docker-compose.yaml @@ -78,7 +78,7 @@ services: dependency.apt.python3-dev: true dependency.pip.apt.virtualenv: true dependency.python.pip: multinode-requirements.txt - dependency.python.deepspeed: 0.14.4 + dependency.pip.deepspeed: 
0.14.4 org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for PyTorch MultiNode Image" org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode diff --git a/pytorch/multinode-requirements.txt b/pytorch/multinode-requirements.txt index 931830c31..8e8455a6e 100644 --- a/pytorch/multinode-requirements.txt +++ b/pytorch/multinode-requirements.txt @@ -1,6 +1,5 @@ neural-compressor==2.6 oneccl_bind_pt==2.3.0+cpu -f https://developer.intel.com/ipex-whl-stable-cpu -# required to build deepspeed ops -oneccl-devel>=2021.13.0 -mpi4py>=3.1.0 +oneccl-devel>=2021.13.0 # required to build deepspeed ops +mpi4py>=3.1.0 # required to build deepspeed ops From 80e4f385c2c4ef330a194d68fda0cba929a9d319 Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Wed, 3 Jul 2024 21:36:29 -0700 Subject: [PATCH 09/12] update reqs Signed-off-by: tylertitsworth --- pytorch/multinode-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/multinode-requirements.txt b/pytorch/multinode-requirements.txt index 8e8455a6e..53f579ca9 100644 --- a/pytorch/multinode-requirements.txt +++ b/pytorch/multinode-requirements.txt @@ -1,5 +1,5 @@ neural-compressor==2.6 oneccl_bind_pt==2.3.0+cpu --f https://developer.intel.com/ipex-whl-stable-cpu +--extra-index-url https://developer.intel.com/ipex-whl-stable-cpu oneccl-devel>=2021.13.0 # required to build deepspeed ops mpi4py>=3.1.0 # required to build deepspeed ops From 62c03c004518bc2b9fb27f5c392042d84a8a23d1 Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Mon, 8 Jul 2024 12:10:05 -0700 Subject: [PATCH 10/12] update smoke test output Signed-off-by: tylertitsworth --- pytorch/docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch/docker-compose.yaml b/pytorch/docker-compose.yaml index 97bed662e..0402e2740 100644 --- a/pytorch/docker-compose.yaml 
+++ b/pytorch/docker-compose.yaml @@ -84,7 +84,7 @@ services: org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode target: multinode command: > - python -c "'import neural_compressor;import oneccl_bindings_for_pytorch as oneccl;import deepspeed; + bash -c "python -c 'import neural_compressor;import oneccl_bindings_for_pytorch as oneccl;import deepspeed; print(\"Neural Compressor:\", neural_compressor.__version__, \"\\nOneCCL:\", oneccl.__version__, \"\\nDeepspeed:\", deepspeed.__version__)'" From acfeaaee2c247169cb5ebe927703399015142159 Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Mon, 8 Jul 2024 12:13:34 -0700 Subject: [PATCH 11/12] update documentation Signed-off-by: tylertitsworth --- pytorch/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pytorch/README.md b/pytorch/README.md index a94760fab..c8d5e91cd 100644 --- a/pytorch/README.md +++ b/pytorch/README.md @@ -237,7 +237,10 @@ To add these files correctly please follow the steps described below. ``` > [!NOTE] -> [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network. +> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network. + +> [!TIP] +> Additionally, [DeepSpeed*] optimizations can be utilized in place of ipexrun with the `ccl` backend for multi-node training. 
--- @@ -331,12 +334,14 @@ It is the image user's responsibility to ensure that any use of The images below [Intel® Data Center GPU Flex Series]: https://ark.intel.com/content/www/us/en/ark/products/series/230021/intel-data-center-gpu-flex-series.html [Intel® Data Center GPU Max Series]: https://ark.intel.com/content/www/us/en/ark/products/series/232874/intel-data-center-gpu-max-series.html +[Intel® MPI]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html [Intel® Extension for PyTorch*]: https://intel.github.io/intel-extension-for-pytorch/ [Intel® Distribution for Python*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html [Intel® oneAPI Collective Communications Library]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html [INC]: https://github.com/intel/neural-compressor [PyTorch*]: https://pytorch.org/ [TorchServe*]: https://github.com/pytorch/serve +[DeepSpeed*]: https://github.com/microsoft/DeepSpeed [v0.4.0-Beta]: https://github.com/intel/ai-containers/blob/main/pytorch/Dockerfile [v0.3.4]: https://github.com/intel/ai-containers/blob/v0.3.4/pytorch/Dockerfile From 8aff7894a48a22d4341ec0757fb3a4de742322a9 Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Mon, 8 Jul 2024 15:35:51 -0700 Subject: [PATCH 12/12] add handling for idp setvars Signed-off-by: tylertitsworth --- pytorch/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch/Dockerfile b/pytorch/Dockerfile index dc219d322..0fcc432b3 100644 --- a/pytorch/Dockerfile +++ b/pytorch/Dockerfile @@ -116,7 +116,10 @@ COPY multinode/generate_ssh_keys.sh /generate_ssh_keys.sh # modify generate_ssh_keys to be a helper script # print how to use helper script on bash startup # Avoids loop for further execution of the startup file -RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \ +ARG PACKAGE_OPTION=pip
+ARG PYPATH="/usr/local/lib/python${PYTHON_VERSION}/dist-packages" +RUN if [ "${PACKAGE_OPTION}" = "idp" ]; then PYPATH="/opt/conda/envs/idp/lib/python${PYTHON_VERSION}/site-packages"; fi && \ + echo "source ${PYPATH}/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \ cat '/generate_ssh_keys.sh' >> ~/.startup && \ rm -rf /generate_ssh_keys.sh