From 27b67a2025b9804ebbd736dc991a7f930abea748 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 16 Aug 2024 21:16:01 +0000
Subject: [PATCH 1/4] Bump the tensorflow group across 1 directory with 8
 updates

Bumps the tensorflow group with 8 updates in the /tensorflow directory:

| Package | From | To |
| --- | --- | --- |
| [tensorflow](https://github.com/tensorflow/tensorflow) | `2.15.0` | `2.17.0` |
| [jupyterlab](https://github.com/jupyterlab/jupyterlab) | `4.3.0a0` | `4.3.0b0` |
| [notebook](https://github.com/jupyter/notebook) | `7.3.0a0` | `7.3.0a1` |
| [neural-compressor](https://github.com/intel/neural-compressor) | `2.6` | `3.0` |
| [intel-extension-for-tensorflow[cpu]](https://github.com/intel/intel-extension-for-tensorflow) | `2.15.0.0` | `2.15.0.1` |
| [pillow](https://github.com/python-pillow/Pillow) | `10.3.0` | `10.4.0` |
| [numpy](https://github.com/numpy/numpy) | `2.0.0` | `2.0.1` |
| [tensorflow-serving-api](http://tensorflow.org/serving) | `2.16.1` | `2.17.0` |

Updates `tensorflow` from 2.15.0 to 2.17.0
- [Release notes](https://github.com/tensorflow/tensorflow/releases)
- [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md)
- [Commits](https://github.com/tensorflow/tensorflow/compare/v2.15.0...v2.17.0)

Updates `jupyterlab` from 4.3.0a0 to 4.3.0b0
- [Release notes](https://github.com/jupyterlab/jupyterlab/releases)
- [Changelog](https://github.com/jupyterlab/jupyterlab/blob/main/CHANGELOG.md)
- [Commits](https://github.com/jupyterlab/jupyterlab/compare/@jupyterlab/lsp@4.3.0-alpha.0...@jupyterlab/lsp@4.3.0-beta.0)

Updates `notebook` from 7.3.0a0 to 7.3.0a1
- [Release notes](https://github.com/jupyter/notebook/releases)
- [Changelog](https://github.com/jupyter/notebook/blob/main/CHANGELOG.md)
- [Commits](https://github.com/jupyter/notebook/compare/@jupyter-notebook/tree@7.3.0-alpha.0...@jupyter-notebook/tree@7.3.0-alpha.1)

Updates `neural-compressor` from 2.6 to 3.0
- [Release notes](https://github.com/intel/neural-compressor/releases)
- [Commits](https://github.com/intel/neural-compressor/compare/v2.6...v3.0)

Updates `intel-extension-for-tensorflow[cpu]` from 2.15.0.0 to 2.15.0.1
- [Release notes](https://github.com/intel/intel-extension-for-tensorflow/releases)
- [Commits](https://github.com/intel/intel-extension-for-tensorflow/compare/v2.15.0.0...v2.15.0.1)

Updates `pillow` from 10.3.0 to 10.4.0
- [Release notes](https://github.com/python-pillow/Pillow/releases)
- [Changelog](https://github.com/python-pillow/Pillow/blob/main/CHANGES.rst)
- [Commits](https://github.com/python-pillow/Pillow/compare/10.3.0...10.4.0)

Updates `numpy` from 2.0.0 to 2.0.1
- [Release notes](https://github.com/numpy/numpy/releases)
- [Changelog](https://github.com/numpy/numpy/blob/main/doc/RELEASE_WALKTHROUGH.rst)
- [Commits](https://github.com/numpy/numpy/compare/v2.0.0...v2.0.1)

Updates `tensorflow-serving-api` from 2.16.1 to 2.17.0

---
updated-dependencies:
- dependency-name: tensorflow
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: tensorflow
- dependency-name: jupyterlab
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: tensorflow
- dependency-name: notebook
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: tensorflow
- dependency-name: neural-compressor
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: tensorflow
- dependency-name: intel-extension-for-tensorflow[cpu]
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: tensorflow
- dependency-name: pillow
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: tensorflow
- dependency-name: numpy
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: tensorflow
- dependency-name: tensorflow-serving-api
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: tensorflow
...

Signed-off-by: dependabot[bot]
---
 tensorflow/jupyter-requirements.txt   | 4 ++--
 tensorflow/multinode-requirements.txt | 2 +-
 tensorflow/requirements.txt           | 6 +++---
 tensorflow/serving/requirements.txt   | 8 ++++----
 tensorflow/xpu-requirements.txt       | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/jupyter-requirements.txt b/tensorflow/jupyter-requirements.txt
index 23a738859..e95ad6e83 100644
--- a/tensorflow/jupyter-requirements.txt
+++ b/tensorflow/jupyter-requirements.txt
@@ -1,4 +1,4 @@
-jupyterlab==4.3.0a0
+jupyterlab==4.3.0b0
 jupyterhub==5.1.0
-notebook==7.3.0a0
+notebook==7.3.0a1
 jupyter-server-proxy>=4.1.2
diff --git a/tensorflow/multinode-requirements.txt b/tensorflow/multinode-requirements.txt
index d9cff3697..f5381669b 100644
--- a/tensorflow/multinode-requirements.txt
+++ b/tensorflow/multinode-requirements.txt
@@ -1,3 +1,3 @@
 cython
 tf2onnx
-neural-compressor==2.6
+neural-compressor==3.0
diff --git a/tensorflow/requirements.txt b/tensorflow/requirements.txt
index 9b50ec785..4d1838312 100644
--- a/tensorflow/requirements.txt
+++ b/tensorflow/requirements.txt
@@ -1,4 +1,4 @@
-tensorflow==2.15.0
-intel-extension-for-tensorflow[cpu]==2.15.0.0
+tensorflow==2.17.0
+intel-extension-for-tensorflow[cpu]==2.15.0.1
 tensorflow-hub==0.16.1
-pillow==10.3.0
+pillow==10.4.0
diff --git a/tensorflow/serving/requirements.txt b/tensorflow/serving/requirements.txt
index cf28053cb..cd80fbcd9 100644
--- a/tensorflow/serving/requirements.txt
+++ b/tensorflow/serving/requirements.txt
@@ -1,5 +1,5 @@
-numpy==2.0.0
-pillow==10.3.0
+numpy==2.0.1
+pillow==10.4.0
 requests==2.32.3
-tensorflow==2.16.1
-tensorflow-serving-api==2.16.1
+tensorflow==2.17.0
+tensorflow-serving-api==2.17.0
diff --git a/tensorflow/xpu-requirements.txt b/tensorflow/xpu-requirements.txt
index 0280ef9d5..2c7e3f7a9 100644
--- a/tensorflow/xpu-requirements.txt
+++ b/tensorflow/xpu-requirements.txt
@@ -1,2 +1,2 @@
-tensorflow==2.15.1
+tensorflow==2.17.0
 intel-extension-for-tensorflow[xpu]==2.15.0.1

From 3fbabb9c28af2a4ce97aff0a08b79e8d2a5fec24 Mon Sep 17 00:00:00 2001
From: tylertitsworth
Date: Fri, 16 Aug 2024 16:07:29 -0700
Subject: [PATCH 2/4] update tf

Signed-off-by: tylertitsworth
---
 docs/classical-ml/README.md                | 117 ++++
 docs/index.md                              | 173 ++++
 docs/preset/README.md                      | 167 ++++
 docs/python/README.md                      |  44 +++
 docs/pytorch/README.md                     | 434 +++++
 docs/tensorflow/README.md                  | 353 +++++
 docs/workflows/README.md                   |  36 ++
 tensorflow/Dockerfile                      | 140 +++--
 tensorflow/README.md                       | 103 ++++-
 tensorflow/docker-compose.yaml             |  18 +-
 tensorflow/hvd-requirements.txt            |   1 -
 tensorflow/jupyter-requirements.txt        |   4 +-
 tensorflow/multinode-requirements.txt      |   3 -
 tensorflow/multinode/dockerd-entrypoint.sh |  21 +
 tensorflow/multinode/generate_ssh_keys.sh  |  28 ++
 tensorflow/multinode/requirements.txt      |   5 +
 tensorflow/multinode/ssh_config            |   4 +
 tensorflow/multinode/sshd_config           |  12 +
 tensorflow/ompi-requirements.txt           |   1 -
 tensorflow/requirements.txt                |   4 +-
 tensorflow/xpu-requirements.txt            |   2 +-
 21 files changed, 1555 insertions(+), 115 deletions(-)
 create mode 100644 docs/classical-ml/README.md
 create mode 100644 docs/index.md
 create mode 100644 docs/preset/README.md
 create mode 100644 docs/python/README.md
 create mode 100644 docs/pytorch/README.md
 create mode 100644 docs/tensorflow/README.md
 create mode 100644 docs/workflows/README.md
 delete mode 100644 tensorflow/hvd-requirements.txt
 delete mode 100644 tensorflow/multinode-requirements.txt
 create mode 100755 tensorflow/multinode/dockerd-entrypoint.sh
 create mode 100755 tensorflow/multinode/generate_ssh_keys.sh
 create mode 100644 tensorflow/multinode/requirements.txt
 create mode 100644 tensorflow/multinode/ssh_config
 create mode 100644 tensorflow/multinode/sshd_config
 delete mode 100644 tensorflow/ompi-requirements.txt

diff --git a/docs/classical-ml/README.md b/docs/classical-ml/README.md
new file mode 100644
index 000000000..9d63355c4
--- /dev/null
+++ b/docs/classical-ml/README.md
@@ -0,0 +1,117 @@
# Intel® Optimized ML

[Intel® Extension for Scikit-learn*] enhances the performance of [Scikit-learn*] by accelerating the training and inference of machine learning models on Intel® hardware.

[XGBoost*] is an optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable.

## Images

The images below include [Intel® Extension for Scikit-learn*] and [XGBoost*].

| Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile |
| ------------------------------------------------- | -------------- | ------------ | -------- | --------------- |
| `2024.6.0-pip-base`, `latest` | [v2024.6.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] |
| `2024.5.0-pip-base` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] |
| `2024.3.0-pip-base` | [v2024.3.0] | [v1.4.2] | [v2.0.3] | [v0.4.0-Beta] |
| `2024.2.0-xgboost-2.0.3-pip-base` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] |
| `scikit-learning-2024.0.0-xgboost-2.0.2-pip-base` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] |

The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:

| Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile |
| ---------------------------------------------------- | -------------- | ------------ | -------- | --------------- |
| `2024.6.0-pip-jupyter` | [v2024.6.0] | [v1.5.1] | [v2.1.1] | [v0.4.0] |
| `2024.5.0-pip-jupyter` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] |
| `2024.3.0-pip-jupyter` | [v2024.3.0] | [v1.4.2] | [v2.0.3] | [v0.4.0-Beta] |
| `2024.2.0-xgboost-2.0.3-pip-jupyter` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] |
| `scikit-learning-2024.0.0-xgboost-2.0.2-pip-jupyter` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] |

### Run the Jupyter Container

```bash
docker run -it --rm \
    -p 8888:8888 \
    --net=host \
    -v $PWD/workspace:/workspace \
    -w /workspace \
    intel/intel-optimized-ml:2024.2.0-xgboost-2.0.3-pip-jupyter
```

After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.
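Once inside either image, a quick way to confirm that the accelerated backend is active is to patch scikit-learn and fit a small model. A minimal sketch (the dataset and estimator here are arbitrary):

```python
import numpy as np
from sklearnex import patch_sklearn

# Replace supported scikit-learn estimators with the
# Intel® Extension for Scikit-learn* implementations.
patch_sklearn()

# Import estimators only after patching so the accelerated versions load.
from sklearn.cluster import KMeans

X = np.random.rand(1000, 8)
labels = KMeans(n_clusters=4, random_state=0).fit_predict(X)
print(labels[:10])
```

`patch_sklearn()` logs a confirmation line listing the patched estimators, so the console output shows whether the Intel implementations are in use.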
## Images with Intel® Distribution for Python*

The images below include [Intel® Distribution for Python*]:

| Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile |
| ------------------------------------------------- | -------------- | ------------ | -------- | --------------- |
| `2024.6.0-idp-base` | [v2024.6.0] | [v1.5.1] | [v2.1.1] | [v0.4.0] |
| `2024.5.0-idp-base` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] |
| `2024.3.0-idp-base` | [v2024.3.0] | [v1.4.1] | [v2.1.0] | [v0.4.0] |
| `2024.2.0-xgboost-2.0.3-idp-base` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] |
| `scikit-learning-2024.0.0-xgboost-2.0.2-idp-base` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] |

The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:

| Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile |
| ---------------------------------------------------- | -------------- | ------------ | -------- | --------------- |
| `2024.6.0-idp-jupyter` | [v2024.6.0] | [v1.5.1] | [v2.1.1] | [v0.4.0] |
| `2024.5.0-idp-jupyter` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] |
| `2024.3.0-idp-jupyter` | [v2024.3.0] | [v1.4.0] | [v2.1.0] | [v0.4.0] |
| `2024.2.0-xgboost-2.0.3-idp-jupyter` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] |
| `scikit-learning-2024.0.0-xgboost-2.0.2-idp-jupyter` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] |

## Build from Source

To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to set up your environment, and run the following commands:

```bash
cd classical-ml
docker compose build ml-base
docker compose run ml-base
```

You can find the list of services below for each container in the group:

| Service Name | Description |
| ------------ | ------------------------------------------------------------------- |
| `ml-base` | Base image with [Intel® Extension for Scikit-learn*] and [XGBoost*] |
| `jupyter` | Adds Jupyter Notebook server |

## License

View the [License](https://github.com/intel/ai-containers/blob/main/LICENSE) for the [Intel® Distribution for Python].

These images also contain other software which may be under other licenses (such as PyTorch*, Jupyter*, Bash, etc. from the base image).

It is the image user's responsibility to ensure that any use of these images complies with all relevant licenses for the software contained within.

\* Other names and brands may be claimed as the property of others.
[Intel® Extension for Scikit-learn*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/scikit-learn.html
[Intel® Distribution for Python]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html#gs.9bos9m
[Scikit-learn*]: https://scikit-learn.org/stable/
[XGBoost*]: https://github.com/dmlc/xgboost

[v2024.6.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.6.0
[v2024.5.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.5.0
[v2024.3.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.3.0
[v2024.2.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.2.0
[v2024.0.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.0.0

[v1.5.1]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.5.1
[v1.5.0]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.5.0
[v1.4.2]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.4.2
[v1.4.1]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.4.1
[v1.3.2]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.3.2

[v2.1.1]: https://github.com/dmlc/xgboost/releases/tag/v2.1.1
[v2.1.0]: https://github.com/dmlc/xgboost/releases/tag/v2.1.0
[v2.0.3]: https://github.com/dmlc/xgboost/releases/tag/v2.0.3
[v2.0.2]: https://github.com/dmlc/xgboost/releases/tag/v2.0.2

[v0.4.0]: https://github.com/intel/ai-containers/blob/v0.4.0/classical-ml/Dockerfile
[v0.4.0-Beta]: https://github.com/intel/ai-containers/blob/v0.4.0-Beta/classical-ml/Dockerfile
[v0.3.4]: https://github.com/intel/ai-containers/blob/v0.3.4/classical-ml/Dockerfile
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 000000000..237051230
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,173 @@
# Intel® AI Containers

[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/8270/badge)](https://www.bestpractices.dev/projects/8270)
[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/intel/ai-containers/badge)](https://securityscorecards.dev/viewer/?uri=github.com/intel/ai-containers)
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/intel/ai-containers/main.svg)](https://results.pre-commit.ci/latest/github/intel/ai-containers/main)
[![Coverage Status](https://coveralls.io/repos/github/intel/ai-containers/badge.svg?branch=main)](https://coveralls.io/github/intel/ai-containers?branch=main)
[![CodeQL](https://github.com/intel/ai-containers/actions/workflows/github-code-scanning/codeql/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/github-code-scanning/codeql)
[![Docs](https://github.com/intel/ai-containers/actions/workflows/docs.yaml/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/docs.yaml)
[![Lint](https://github.com/intel/ai-containers/actions/workflows/lint.yaml/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/lint.yaml)
[![Test Runner CI](https://github.com/intel/ai-containers/actions/workflows/test-runner-ci.yaml/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/test-runner-ci.yaml)
[![Helm Chart CI](https://github.com/intel/ai-containers/actions/workflows/chart-ci.yaml/badge.svg?event=merge_group)](https://github.com/intel/ai-containers/actions/workflows/chart-ci.yaml)
[![Weekly Tests](https://github.com/intel/ai-containers/actions/workflows/weekly-test.yaml/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/weekly-test.yaml)

This repository contains Dockerfiles, scripts, yaml files, Helm charts, etc. used to scale out AI containers with versions of TensorFlow and PyTorch that have been optimized for Intel platforms. Scaling is done with Python, Docker, Kubernetes, Kubeflow, cnvrg.io, Helm, and other container orchestration frameworks for use in the cloud and on-premise.

## Project Setup

Define your project's registry and repository each time you use the project:

```bash
# REGISTRY/REPO:TAG
export REGISTRY=<registry>
export REPO=<repo>

docker login $REGISTRY

# Verify your access permissions
docker pull $REGISTRY/$REPO:latest
```

The maintainers of Intel® AI Containers use Azure to store containers, but an open-source container registry like [harbor](https://github.com/goharbor/harbor) is preferred.

> [!WARNING]
> You can optionally skip this step and use placeholder values; however, some container groups depend on other images and will pull from a registry that you have not defined, resulting in an error.

### Set Up Docker Engine

You'll need to install Docker Engine on your development system. Note that while **Docker Engine** is free to use, **Docker Desktop** may require you to purchase a license. See the [Docker Engine Server installation instructions](https://docs.docker.com/engine/install/#server) for details.

### Set Up Docker Compose

Ensure you have Docker Compose installed on your machine. If you don't have this tool installed, consult the official [Docker Compose installation documentation](https://docs.docker.com/compose/install/linux/#install-the-plugin-manually).

```bash
DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker}
mkdir -p $DOCKER_CONFIG/cli-plugins
curl -SL https://github.com/docker/compose/releases/download/v2.26.1/docker-compose-linux-x86_64 -o $DOCKER_CONFIG/cli-plugins/docker-compose
chmod +x $DOCKER_CONFIG/cli-plugins/docker-compose
docker compose version
```

> [!CAUTION]
> Docker Compose `v2.25.0` is the minimum required version for some container groups.

## Build Containers

Select your framework of choice (TensorFlow*, PyTorch*, Classical ML) and run the docker compose commands:

```bash
cd <framework>
docker compose up --build
```

To configure these containers, simply append the relevant environment variable to the docker compose command based on the build arguments in the compose file. For example:

```bash
# I want to build ipex-base with Intel® Distribution for Python
cd pytorch
PACKAGE_OPTION=idp docker compose up --build ipex-base
```

> [!NOTE]
> If you didn't specify `REGISTRY` or `REPO`, you also need to add the `idp` service to the list to build the dependent python image.

## Test Containers

To test the containers, use the [Test Runner Framework](https://github.com/intel/ai-containers/tree/main/test-runner):

```bash
# I want to test ipex-base with Intel® Distribution for Python
# 1. build the container in the above section
# 2. push it to a relevant registry
PACKAGE_OPTION=idp docker compose push ipex-base
cd ..
# 3. install the test runner python requirements
pip install -r test-runner/requirements.txt
# 4. run the test file
PACKAGE_OPTION=idp python test-runner/test_runner.py -f pytorch/tests/tests.yaml
```

> [!TIP]
> To test a container built by GitHub Actions CI/CD, find the `run number` associated with the workflow run and set the `GITHUB_RUN_NUMBER` environment variable during execution to pull the desired image.

## Deploy Containers

### Install [Helm](https://helm.sh/docs/intro/install/)

This assumes you've set up [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) and have a `KUBECONFIG`.

```bash
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \
chmod 700 get_helm.sh && \
./get_helm.sh
```

### Deploy a Helm Chart

```bash
cd workflows/charts
# Select a Chart and check its README for a list of customization options and other steps required.
helm install \
    --namespace=<namespace> \
    --set <key>=<value> \
    <release-name> <chart>
```

### Test a Helm Chart

Install [Chart Testing](https://github.com/helm/chart-testing).

```bash
pip install -r workflows/charts/dev-requirements.txt
brew install chart-testing
```

Utilize the `ct` CLI to run `helm lint`, `helm install`, and `helm test`.

```bash
ct lint-and-install --namespace=<namespace> --config .github/ct.yaml --charts workflows/charts/<chart>
```

## Troubleshooting

- See the [Docker Troubleshooting Article](https://docs.docker.com/engine/install/troubleshoot/).
- Verify that [Docker Engine Post-Install Steps](https://docs.docker.com/engine/install/linux-postinstall/) are completed.
- When facing a socket error, check the group membership of the user and ensure they are part of the `docker` group.
- After changing any docker files or configs, restart the docker service: `sudo systemctl restart docker`.
- Enable [Docker Desktop for WSL 2](https://docs.docker.com/desktop/windows/wsl/).
- If you are trying to access a container UI from the browser, make sure you have [port forwarded](https://code.visualstudio.com/docs/remote/ssh#_forwarding-a-port-creating-ssh-tunnel) and reconnect.
- If your environment requires a proxy to access the internet, export your development system's proxy settings to the docker environment:

```bash
export DOCKER_BUILD_ARGS="--build-arg ftp_proxy=${ftp_proxy} \
  --build-arg FTP_PROXY=${FTP_PROXY} --build-arg http_proxy=${http_proxy} \
  --build-arg HTTP_PROXY=${HTTP_PROXY} --build-arg https_proxy=${https_proxy} \
  --build-arg HTTPS_PROXY=${HTTPS_PROXY} --build-arg no_proxy=${no_proxy} \
  --build-arg NO_PROXY=${NO_PROXY} --build-arg socks_proxy=${socks_proxy} \
  --build-arg SOCKS_PROXY=${SOCKS_PROXY}"
```

```bash
export DOCKER_RUN_ENVS="-e ftp_proxy=${ftp_proxy} \
  -e FTP_PROXY=${FTP_PROXY} -e http_proxy=${http_proxy} \
  -e HTTP_PROXY=${HTTP_PROXY} -e https_proxy=${https_proxy} \
  -e HTTPS_PROXY=${HTTPS_PROXY} -e no_proxy=${no_proxy} \
  -e NO_PROXY=${NO_PROXY} -e socks_proxy=${socks_proxy} \
  -e SOCKS_PROXY=${SOCKS_PROXY}"
```

```bash
docker build $DOCKER_BUILD_ARGS -t my:tag .
docker run $DOCKER_RUN_ENVS --rm -it my:tag
```

## Support

The Intel AI MLOps team tracks bugs and enhancement requests using [GitHub issues](https://github.com/intel/ai-containers/issues). Before submitting a suggestion or bug report, search the existing GitHub issues to see if your issue has already been reported.
---

- [Trademarks](http://www.intel.com/content/www/us/en/legal/trademarks.html)
diff --git a/docs/preset/README.md b/docs/preset/README.md
new file mode 100644
index 000000000..b2a31dd82
--- /dev/null
+++ b/docs/preset/README.md
@@ -0,0 +1,167 @@
# AI Tools Selector Preset Containers

AI Tools Selector Preset Containers provide data scientists and developers with an environment for performing data-science tasks such as data analysis, data processing, and training and inference of machine learning and deep learning models. Each container is equipped with the Python packages and tools suited to its task. More detail about each container is provided in the table below.

## Preset Containers

| Preset Container Name | Purpose | Tools | Image Name |
| -----------------------------| ------------- | ------------- | ----------------- |
| Data Analytics | Perform large-scale data analysis | [Modin*](https://github.com/modin-project/modin), [Intel® Dataset Librarian](https://github.com/IntelAI/models/tree/master/datasets/dataset_api), [Intel® Data Connector](https://github.com/IntelAI/models/tree/master/datasets/cloud_data_connector) | [`intel/data-analytics:latest-py3.9`](https://hub.docker.com/r/intel/data-analytics/tags)<br />[`intel/data-analytics:latest-py3.10`](https://hub.docker.com/r/intel/data-analytics/tags) |
| Classical ML | Train classical ML models using Scikit-learn*, Modin*, and XGBoost* | [Intel® Extension for Scikit-learn*](https://github.com/intel/scikit-learn-intelex), [Intel® Optimization for XGBoost*](https://github.com/dmlc/xgboost), [Modin*](https://github.com/modin-project/modin), [Intel® Dataset Librarian](https://github.com/IntelAI/models/tree/master/datasets/dataset_api), [Intel® Data Connector](https://github.com/IntelAI/models/tree/master/datasets/cloud_data_connector) | [`intel/classical-ml:latest-py3.9`](https://hub.docker.com/r/intel/classical-ml/tags)<br />[`intel/classical-ml:latest-py3.10`](https://hub.docker.com/r/intel/classical-ml/tags) |
| Deep Learning | Train large-scale deep learning models with TensorFlow* or PyTorch* | [Intel® Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch), [Intel® Extension for TensorFlow](https://github.com/intel/intel-extension-for-tensorflow), [Intel® Optimization for Horovod](https://github.com/intel/intel-optimization-for-horovod), [Intel® Dataset Librarian](https://github.com/IntelAI/models/tree/master/datasets/dataset_api), [Intel® Data Connector](https://github.com/IntelAI/models/tree/master/datasets/cloud_data_connector), [Intel® Extension for DeepSpeed](https://github.com/intel/intel-extension-for-deepspeed) | [`intel/deep-learning:latest-py3.9`](https://hub.docker.com/r/intel/deep-learning/tags)<br />[`intel/deep-learning:latest-py3.10`](https://hub.docker.com/r/intel/deep-learning/tags) |
| Inference Optimization | Optimize deep learning models for inference using Intel® Neural Compressor | [Intel® Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch), [Intel® Extension for TensorFlow](https://github.com/intel/intel-extension-for-tensorflow), [Intel® Neural Compressor](https://github.com/intel/neural-compressor), [Intel® Dataset Librarian](https://github.com/IntelAI/models/tree/master/datasets/dataset_api), [Intel® Data Connector](https://github.com/IntelAI/models/tree/master/datasets/cloud_data_connector) | [`intel/inference-optimization:latest-py3.9`](https://hub.docker.com/r/intel/inference-optimization/tags)<br />[`intel/inference-optimization:latest-py3.10`](https://hub.docker.com/r/intel/inference-optimization/tags) |

## Prerequisites

1. Make sure [Docker](https://docs.docker.com/engine/) is installed on the machine. Follow the [instructions here](https://docs.docker.com/engine/install/) to install Docker Engine on a host machine.

2. Pull a Preset Container of your choice from the [AI Tools Selector](https://www.intel.com/content/www/us/en/developer/tools/oneapi/ai-tools-selector.html) or from the [table](#preset-containers). The commands below use the `deep-learning` preset as an example.

```bash
docker pull intel/deep-learning:latest-py3.9
```

## Run Preset Container

There are 3 modes to run these containers:

* [Interactive](#run-in-interactive-mode)
* [Jupyter](#run-using-jupyter-notebook)
* [Multi-Node Distributed Training](#run-in-multi-node-distributed-mode-advanced) (Deep Learning and Inference Optimization)

> [!NOTE]
> Modify the commands below to fit your use case, especially the image, environment variables, and GPU device path.

### Run in Interactive Mode

This mode runs the container in an interactive shell, which enables you to interact with the container's bash shell. Below is the command to start the container in interactive mode:

#### Run on CPU

```bash
docker run -it --rm \
    --shm-size=12G \
    -v ${PWD}:/home/dev/workdir \
    intel/deep-learning:latest-py3.9 bash
```

> [!NOTE]
> Certain applications use shared memory to share data between processes. But the default shared memory segment size is 64M for docker containers, and is not enough for multithreaded applications (e.g., Modin*). Docker recommends increasing the shared memory size using `--shm-size`.

#### Run on GPU

Find your machine's `RENDER` and `VIDEO` group values to enable [Intel® Flex/Max GPU](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu.html).

```bash
RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,')
VIDEO=$(getent group video | sed -E 's,^video:[^:]*:([^:]*):.*$,\1,')
test -z "$RENDER" || RENDER_GROUP="--group-add ${RENDER}"
test -z "$VIDEO" || VIDEO_GROUP="--group-add ${VIDEO}"
```

```bash
docker run -it --rm \
    ${RENDER_GROUP} \
    ${VIDEO_GROUP} \
    --device=/dev/dri \
    --shm-size=12G \
    -v ${PWD}:/home/dev/workdir \
    -v /dev/dri/by-path:/dev/dri/by-path \
    intel/deep-learning:latest-py3.9 bash
```

> [!NOTE]
> Certain applications use shared memory to share data between processes. But the default shared memory segment size is 64M for docker containers, and is not enough for multithreaded applications (e.g., Modin*). Docker recommends increasing the shared memory size using `--shm-size`.

#### Next Steps

1. For Deep Learning and Inference Optimization containers there will be separate conda environments for each AI framework: `pytorch-cpu`, `pytorch-gpu` and `tensorflow`. Use the command below to activate one environment:

    ```bash
    conda activate <env-name>
    ```

2. Select a test from the `sample-tests` folder and run it using the following command as an example:

    ```bash
    bash sample-tests/onnx/run.sh
    # or if no bash script is found
    python sample-tests/intel_extension_for_tensorflow/test_itex.py
    ```

### Run using Jupyter Notebook

This mode launches a JupyterLab notebook server. The command below starts the JupyterLab server, which can be accessed from a web browser. Each container includes a Jupyter kernel for each conda environment, so the environments can be used from within a Jupyter notebook. The port for this server is `8888` and is exposed by default when you run the container.

> [!NOTE]
> When launching a jupyter notebook server this way, docker assigns a [network](https://docs.docker.com/engine/tutorials/networkingcontainers/) such that the container can communicate with other applications like a web browser. By default docker launches containers with the `bridge` network, but if you are trying to access this server from a machine you are `ssh`'ing into, change the network mode with the `--net=host` flag and ensure you are local port forwarding with `ssh -L 8888:8888`.

#### Run on Jupyter CPU

```bash
docker run -it --rm \
    --shm-size=12G \
    -v ${PWD}:/home/dev/workdir \
    intel/deep-learning:latest-py3.9
```

> [!NOTE]
> Certain applications use shared memory to share data between processes. But the default shared memory segment size is 64M for docker containers, and is not enough for multithreaded applications (e.g., Modin*). Docker recommends increasing the shared memory size using `--shm-size`.

#### Run on Jupyter GPU

Find your machine's `RENDER` and `VIDEO` group values to enable [Intel® Flex/Max GPU](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu.html).

```bash
RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,')
VIDEO=$(getent group video | sed -E 's,^video:[^:]*:([^:]*):.*$,\1,')
test -z "$RENDER" || RENDER_GROUP="--group-add ${RENDER}"
test -z "$VIDEO" || VIDEO_GROUP="--group-add ${VIDEO}"
```

```bash
docker run -it --rm \
    ${RENDER_GROUP} \
    ${VIDEO_GROUP} \
    --device=/dev/dri \
    --shm-size=12G \
    -v ${PWD}:/home/dev/workdir \
    -v /dev/dri/by-path:/dev/dri/by-path \
    intel/deep-learning:latest-py3.9
```

> [!NOTE]
> Certain applications use shared memory to share data between processes. But the default shared memory segment size is 64M for docker containers, and is not enough for multithreaded applications (e.g., Modin*). Docker recommends increasing the shared memory size using `--shm-size`.

#### Next Steps

1. After running this command, the terminal should display output similar to the image below. ![image](https://github.com/intel/ai-containers/assets/18349036/0a8a2d05-f7b0-4a9f-994e-bcc4e4b703a0) The server address together with the port can be used to connect to the Jupyter server in a web browser, for example `http://127.0.0.1:8888`. The token displayed after `token=` can be used as a password to log in to the server; in the image displayed above, the token is `b66e74a85bc2570bf15782e5124c933c3a4ddabd2cf2d7d3`.

2. Select a notebook sample from the Overview notebook found in the directory you launched the server with. In this example, the `intel/deep-learning` container has a notebook titled [`Deep_Learning_Samples_Overview.ipynb`](https://github.com/intel/ai-containers/blob/main/preset/deep-learning/notebooks/Deep_Learning_Samples_Overview.ipynb) when launched in jupyter mode.

3. After selecting a notebook sample, select the preset kernel found in the dropdown menu presented when loading the notebook. For Deep Learning and Inference Optimization containers there will be multiple kernels, one for each framework: `pytorch`, `pytorch-gpu`, and `tensorflow`.

##### Advanced Jupyter Server Configuration

Modify your notebook server command by using the default example below to change the network (port/ip) and security (privilege) settings, appending it to the docker run commands above:

```bash
docker run ... intel/deep-learning:latest-py3.9 \
    bash -c "jupyter notebook --notebook-dir=~/jupyter \
            --port 8888 \
            --ip 0.0.0.0 \
            --no-browser \
            --allow-root"
```

## Run in Multi-Node Distributed Mode [Advanced]

You can follow the instructions provided for [Intel® Extension for TensorFlow*](https://github.com/intel/ai-containers/tree/main/preset/deep-learning/demo/tensorflow-distributed/README.md) and [Intel® Extension for PyTorch*](https://github.com/intel/ai-containers/tree/main/preset/deep-learning/demo/pytorch-distributed/README.md) along with the Deep Learning or Inference Optimization presets, using your preferred framework.

## Troubleshooting and Support

If you face an issue using a container, you can find more information on how to troubleshoot it [here](https://github.com/intel/ai-containers#troubleshooting). If you need more help, feel free to submit an [issue](https://github.com/intel/ai-containers/issues).

---

*Other names and brands may be claimed as the property of others. [Trademarks](http://www.intel.com/content/www/us/en/legal/trademarks.html)
diff --git a/docs/python/README.md b/docs/python/README.md
new file mode 100644
index 000000000..0b9f95dcd
--- /dev/null
+++ b/docs/python/README.md
@@ -0,0 +1,44 @@
# Intel® Distribution for Python

[Intel® Distribution for Python] enhances performance and can make your programs run 10 to 100 times faster. It is a Python distribution that includes the [Intel® Math Kernel Library] (oneMKL) and other Intel performance libraries to enable near-native performance through acceleration of core numerical and machine learning packages.

[Intel® Distribution for Python] is available as part of the [Intel® oneAPI Base Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html).

## Images

The images below come in two variations: one with only the core packages from the [Intel® Distribution for Python] installation, and one with all of the packages.

| Tag(s) | IDP |
| ---------------------- | ---------- |
| `3.10-full`, `latest` | `2024.2.0` |
| `3.10-core` | `2024.2.0` |

## Build from Source

To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to set up your environment, and run the following commands:

```bash
cd python
docker compose build idp
docker compose run idp
```

You can find the list of services below for each container in the group:

| Service Name | Description |
| ------------ | ------------------------------------------------------------------- |
| `idp` | Base image with [Intel® Distribution for Python] |
| `pip` | Equivalent python image without [Intel® Distribution for Python] |

## License

View the [License](https://github.com/intel/ai-containers/blob/main/LICENSE) for the [Intel® Distribution for Python].

It is the image user's responsibility to ensure that any use of these images complies with all relevant licenses for the software contained within.

\* Other names and brands may be claimed as the property of others.
[Intel® Distribution for Python]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html#gs.9bos9m
[Intel® Math Kernel Library]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html
diff --git a/docs/pytorch/README.md b/docs/pytorch/README.md
new file mode 100644
index 000000000..53adcb1d8
--- /dev/null
+++ b/docs/pytorch/README.md
@@ -0,0 +1,434 @@
# Intel® Extension for PyTorch\*

[Intel® Extension for PyTorch*] extends [PyTorch*] with up-to-date feature optimizations for an extra performance boost on Intel hardware.

On Intel CPUs, the optimizations take advantage of the following instruction sets:

* Intel® Advanced Matrix Extensions (Intel® AMX)
* Intel® Advanced Vector Extensions 512 (Intel® AVX-512)
* Vector Neural Network Instructions (VNNI)

On Intel GPUs, Intel® Extension for PyTorch\* provides easy GPU acceleration through the PyTorch* `xpu` device. The following Intel GPUs are supported:

* [Intel® Arc™ A-Series Graphics]
* [Intel® Data Center GPU Flex Series]
* [Intel® Data Center GPU Max Series]

Images available here start with the [Ubuntu* 22.04](https://hub.docker.com/_/ubuntu) base image with [Intel® Extension for PyTorch*] built for different use cases as well as some additional software. The [Python Dockerfile](https://github.com/intel/ai-containers/blob/main/python/Dockerfile) is used to generate the images below at https://github.com/intel/ai-containers.

> **Note:** There are two dockerhub repositories (`intel/intel-extension-for-pytorch` and `intel/intel-optimized-pytorch`) that are routinely updated with the latest images; however, some legacy images have not been published to both repositories.

## XPU images

The images below include support for both CPU and GPU optimizations:

| Tag(s) | Pytorch | IPEX | Driver | Dockerfile |
| ---------------------- | -------- | -------------- | ------ | --------------- |
| `2.1.40-xpu` | [v2.1.0] | [v2.1.40+xpu] | [914] | [v0.4.0-Beta] |
| `2.1.30-xpu` | [v2.1.0] | [v2.1.30+xpu] | [803] | [v0.4.0-Beta] |
| `2.1.20-xpu` | [v2.1.0] | [v2.1.20+xpu] | [803] | [v0.3.4] |
| `2.1.10-xpu` | [v2.1.0] | [v2.1.10+xpu] | [736] | [v0.2.3] |
| `xpu-flex-2.0.110-xpu` | [v2.0.1] | [v2.0.110+xpu] | [647] | [v0.1.0] |

---

```bash
docker run -it --rm \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --ipc=host \
    intel/intel-extension-for-pytorch:2.1.40-xpu
```

---

The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:

| Tag(s) | Pytorch | IPEX | Driver | Jupyter Port | Dockerfile |
| --------------------- | -------- | ------------- | ------ | ------------ | --------------- |
| `2.1.40-xpu-pip-jupyter` | [v2.1.0] | [v2.1.40+xpu] | [914] | `8888` | [v0.4.0-Beta] |
| `2.1.20-xpu-pip-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] |
| `2.1.10-xpu-pip-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] |

### Run the XPU Jupyter Container

```bash
docker run -it --rm \
    -p 8888:8888 \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    intel/intel-extension-for-pytorch:2.1.40-xpu-pip-jupyter
```

After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.
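Once inside any of the XPU containers, a short script can confirm that the `xpu` device is visible to PyTorch. A minimal sketch (tensor shapes are arbitrary):

```python
import torch
import intel_extension_for_pytorch as ipex  # registers the `xpu` device with PyTorch

print("IPEX version:", ipex.__version__)
print("XPU available:", torch.xpu.is_available())

if torch.xpu.is_available():
    # Run a small matrix multiply on the first Intel GPU.
    x = torch.randn(4, 4, device="xpu")
    print((x @ x).device)
```

If `torch.xpu.is_available()` returns `False`, re-check that `--device /dev/dri` was passed and that the host driver matches the table above.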
## CPU only images

The images below are built only with CPU optimizations (GPU acceleration support was deliberately excluded):

| Tag(s) | Pytorch | IPEX | Dockerfile |
| -------------------------- | -------- | ------------ | --------------- |
| `2.4.0-pip-base`, `latest` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] |
| `2.3.0-pip-base` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] |
| `2.2.0-pip-base` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] |
| `2.1.0-pip-base` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] |
| `2.0.0-pip-base` | [v2.0.0] | [v2.0.0+cpu] | [v0.1.0] |

### Run the CPU Container

```bash
docker run -it --rm intel/intel-extension-for-pytorch:latest
```

---

The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:

| Tag(s) | Pytorch | IPEX | Dockerfile |
| ------------------- | -------- | ------------ | --------------- |
| `2.4.0-pip-jupyter` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] |
| `2.3.0-pip-jupyter` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] |
| `2.2.0-pip-jupyter` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] |
| `2.1.0-pip-jupyter` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] |
| `2.0.0-pip-jupyter` | [v2.0.0] | [v2.0.0+cpu] | [v0.1.0] |

```bash
docker run -it --rm \
    -p 8888:8888 \
    -v $PWD/workspace:/workspace \
    -w /workspace \
    intel/intel-extension-for-pytorch:2.4.0-pip-jupyter
```

After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.

---

The images below additionally include [Intel® oneAPI Collective Communications Library] (oneCCL) and Neural Compressor ([INC]):

| Tag(s) | Pytorch | IPEX | oneCCL | INC | Dockerfile |
| --------------------- | -------- | ------------ | -------------------- | --------- | -------------- |
| `2.4.0-pip-multinode` | [v2.4.0] | [v2.4.0+cpu] | [v2.4.0][ccl-v2.4.0] | [v3.0] | [v0.4.0-Beta] |
| `2.3.0-pip-multinode` | [v2.3.0] | [v2.3.0+cpu] | [v2.3.0][ccl-v2.3.0] | [v2.6] | [v0.4.0-Beta] |
| `2.2.0-pip-multinode` | [v2.2.2] | [v2.2.0+cpu] | [v2.2.0][ccl-v2.2.0] | [v2.6] | [v0.4.0-Beta] |
| `2.1.100-pip-mulitnode` | [v2.1.2] | [v2.1.100+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.6] | [v0.4.0-Beta] |
| `2.0.100-pip-multinode` | [v2.0.1] | [v2.0.100+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.6] | [v0.4.0-Beta] |

> [!NOTE]
> Passwordless SSH connection is also enabled in the image, but the container does not contain any SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`.

> [!TIP]
> Before mounting any keys, modify the permissions of those files with `chmod 600 authorized_keys; chmod 600 id_rsa` to grant read access for the default user account.

#### Setup and Run IPEX Multi-Node Container

Some additional assembly is required to utilize this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively:

SSH Server (Worker)

1. *Authorized Keys* : `/etc/ssh/authorized_keys`

SSH Client (Launcher)

1. *Private User Key* : `/root/.ssh/id_rsa`

To add these files correctly, follow the steps described below.

1. Setup ID Keys

    You can use the commands provided below to [generate the identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH.

    ```bash
    ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa
    touch authorized_keys
    cat id_rsa.pub >> authorized_keys
    ```

2. Configure the permissions and ownership for all of the files you have created so far

    ```bash
    chmod 600 id_rsa config authorized_keys
    chown root:root id_rsa.pub id_rsa config authorized_keys
    ```

3. Create a hostfile for `torchrun` or `ipexrun`. (Optional)

    ```txt
    Host host1
        HostName <hostname>
        IdentitiesOnly yes
        IdentityFile ~/.root/id_rsa
        Port <port>
    Host host2
        HostName <hostname>
        IdentitiesOnly yes
        IdentityFile ~/.root/id_rsa
        Port <port>
    ...
    ```

4. Configure [Intel® oneAPI Collective Communications Library] in your python script

    ```python
    import os

    import torch.distributed as dist
    import oneccl_bindings_for_pytorch  # registers the "ccl" backend

    dist.init_process_group(
        backend="ccl",
        init_method="tcp://127.0.0.1:3022",
        world_size=int(os.environ.get("WORLD_SIZE")),
        rank=int(os.environ.get("RANK")),
    )
    ```

5. Now start the workers and execute DDP on the launcher

    1. Worker run command:

        ```bash
        docker run -it --rm \
            --net=host \
            -v $PWD/authorized_keys:/etc/ssh/authorized_keys \
            -v $PWD/tests:/workspace/tests \
            -w /workspace \
            intel/intel-extension-for-pytorch:2.4.0-pip-multinode \
            bash -c '/usr/sbin/sshd -D'
        ```

    2. Launcher run command:

        ```bash
        docker run -it --rm \
            --net=host \
            -v $PWD/id_rsa:/root/.ssh/id_rsa \
            -v $PWD/tests:/workspace/tests \
            -v $PWD/hostfile:/workspace/hostfile \
            -w /workspace \
            intel/intel-extension-for-pytorch:2.4.0-pip-multinode \
            bash -c 'ipexrun cpu --nnodes 2 --nprocs-per-node 1 --master-addr 127.0.0.1 --master-port 3022 /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl'
        ```

> [!NOTE]
> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.

#### Enable [DeepSpeed*] optimizations

To enable [DeepSpeed*] optimizations with [Intel® oneAPI Collective Communications Library], add the following to your python script:

```python
import deepspeed

# Rather than dist.init_process_group(), use deepspeed.init_distributed()
deepspeed.init_distributed(backend="ccl")
```

Additionally, if you have a [DeepSpeed* configuration](https://www.deepspeed.ai/getting-started/#deepspeed-configuration) you can use the below command as your launcher to run your script with that configuration:

```bash
docker run -it --rm \
    --net=host \
    -v $PWD/id_rsa:/root/.ssh/id_rsa \
    -v $PWD/tests:/workspace/tests \
    -v $PWD/hostfile:/workspace/hostfile \
    -v $PWD/ds_config.json:/workspace/ds_config.json \
    -w /workspace \
    intel/intel-extension-for-pytorch:2.4.0-pip-multinode \
    bash -c 'deepspeed --launcher IMPI \
    --master_addr 127.0.0.1 --master_port 3022 \
    --deepspeed_config ds_config.json --hostfile /workspace/hostfile \
    /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl --deepspeed'
```
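For orientation, the configuration passed above with `--deepspeed_config` is consumed inside the training script by `deepspeed.initialize()`, which returns an engine that wraps the forward/backward/step loop. A minimal sketch (the model here is a placeholder, not the ResNet-50 test script):

```python
import deepspeed
import torch

# Placeholder model for illustration only.
model = torch.nn.Linear(10, 2)

# deepspeed.initialize() reads the same JSON configuration supplied via
# --deepspeed_config and returns a distributed training engine.
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="ds_config.json",
)
```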
---

#### Hugging Face Generative AI Container

The image below is an extension of the IPEX Multi-Node Container designed to run Hugging Face generative AI scripts. The container has the typical installations needed to run and fine-tune PyTorch generative text models from Hugging Face. It can be used to run multi-node jobs using the same instructions from the [IPEX Multi-Node container](#setup-and-run-ipex-multi-node-container).

| Tag(s) | Pytorch | IPEX | oneCCL | HF Transformers | Dockerfile |
| ------------------------------------- | -------- | ------------ | -------------------- | --------------- | --------------- |
| `2.4.0-pip-multinode-hf-4.44.0-genai` | [v2.4.0] | [v2.4.0+cpu] | [v2.4.0][ccl-v2.4.0] | [v4.44.0] | [v0.4.0-Beta] |

Below is an example that shows a single-node job with the existing [`finetune.py`](../workflows/charts/huggingface-llm/scripts/finetune.py) script.

```bash
# Change into home directory first and run the command
docker run -it \
    -v $PWD/workflows/charts/huggingface-llm/scripts:/workspace/scripts \
    -w /workspace/scripts \
    intel/intel-extension-for-pytorch:2.4.0-pip-multinode-hf-4.44.0-genai \
    bash -c 'python finetune.py '
```

---

The images below are [TorchServe*] with CPU Optimizations:

| Tag(s) | Pytorch | IPEX | Dockerfile |
| ------------------- | -------- | ------------ | --------------- |
| `2.4.0-serving-cpu` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] |
| `2.3.0-serving-cpu` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] |
| `2.2.0-serving-cpu` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] |

For more details, follow the procedure in the [TorchServe](https://github.com/pytorch/serve/blob/master/examples/intel_extension_for_pytorch/README.md) instructions.

## CPU only images with Intel® Distribution for Python*

The images below are built only with CPU optimizations (GPU acceleration support was deliberately excluded) and include [Intel® Distribution for Python*]:

| Tag(s) | Pytorch | IPEX | Dockerfile |
| ---------------- | -------- | ------------ | --------------- |
| `2.4.0-idp-base` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] |
| `2.3.0-idp-base` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] |
| `2.2.0-idp-base` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] |
| `2.1.0-idp-base` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] |
| `2.0.0-idp-base` | [v2.0.0] | [v2.0.0+cpu] | [v0.1.0] |

The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:

| Tag(s) | Pytorch | IPEX | Dockerfile |
| ------------------- | -------- | ------------ | --------------- |
| `2.4.0-idp-jupyter` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] |
| `2.3.0-idp-jupyter` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] |
| `2.2.0-idp-jupyter` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] |
| `2.1.0-idp-jupyter` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] |
| `2.0.0-idp-jupyter` | [v2.0.0] | [v2.0.0+cpu] | [v0.1.0] |

The images below additionally include [Intel® oneAPI Collective Communications Library] (oneCCL) and Neural Compressor ([INC]):

| Tag(s) | Pytorch | IPEX | oneCCL | INC | Dockerfile |
| --------------------- | -------- | ------------ | -------------------- | --------- | --------------- |
| `2.4.0-idp-multinode` | [v2.4.0] | [v2.4.0+cpu] | [v2.4.0][ccl-v2.4.0] | [v3.0] | [v0.4.0-Beta] |
| `2.3.0-idp-multinode` | [v2.3.0] | [v2.3.0+cpu] | [v2.3.0][ccl-v2.3.0] | [v2.6] | [v0.4.0-Beta] |
| `2.2.0-idp-multinode` | [v2.2.0] | [v2.2.0+cpu] | [v2.2.0][ccl-v2.2.0] | [v2.4.1] | [v0.3.4] |
| `2.1.0-idp-mulitnode` | [v2.1.0] | [v2.1.0+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.3.1] | [v0.2.3] |
| `2.0.0-idp-multinode` | [v2.0.0] | [v2.0.0+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.1.1] | [v0.1.0] |

## XPU images with Intel® Distribution for Python*

The images below are built with both CPU and GPU optimizations and include [Intel® Distribution for Python*]:

| Tag(s) | Pytorch | IPEX | Driver | Dockerfile |
| --------------------- | -------- | ------------- | ------ | --------------- |
| `2.1.40-xpu-idp-base` | [v2.1.0] | [v2.1.40+xpu] | [914] | [v0.4.0-Beta] |
| `2.1.30-xpu-idp-base` | [v2.1.0] | [v2.1.30+xpu] | [803] | [v0.4.0-Beta] |
| `2.1.10-xpu-idp-base` | [v2.1.0] | [v2.1.10+xpu] | [736] | [v0.2.3] |

The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:

| Tag(s) | Pytorch | IPEX | Driver | Jupyter Port | Dockerfile |
| ------------------------ | -------- | ------------- | ------ | ------------ | --------------- |
| `2.1.40-xpu-idp-jupyter` | [v2.1.0] | [v2.1.40+xpu] | [914] | `8888` | [v0.4.0-Beta] |
| `2.1.20-xpu-idp-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] |
| `2.1.10-xpu-idp-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] |

## Build from Source

To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to set up your environment, and run the following commands:

```bash
cd pytorch
docker compose build ipex-base
docker compose run ipex-base
```

You can find the list of services below for each container in the group:

| Service Name | Description |
| ------------- | ------------------------------------------------------------------- |
| `ipex-base` | Base image with [Intel® Extension for PyTorch*] |
| `jupyter` | Adds Jupyter Notebook server |
| `multinode` | Adds [Intel® oneAPI Collective Communications Library] and [INC] |
| `xpu` | Adds Intel GPU Support |
| `xpu-jupyter` | Adds Jupyter notebook server to GPU image |
| `serving` | [TorchServe*] |

## MLPerf Optimized Workloads

The following images are available for MLPerf-optimized workloads. Instructions are available at '[Get Started with Intel MLPerf]'.

| Tag(s) | Base OS | MLPerf Round | Target Platform |
| --------------------------------- | -------------- | ---------------- | ------------------------------- |
| `mlperf-inference-4.1-resnet50` | rockylinux:8.7 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ |
| `mlperf-inference-4.1-retinanet` | ubuntu:22.04 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ |
| `mlperf-inference-4.1-gptj` | ubuntu:22.04 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ |
| `mlperf-inference-4.1-bert` | ubuntu:22.04 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ |
| `mlperf-inference-4.1-dlrmv2` | rockylinux:8.7 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ |
| `mlperf-inference-4.1-3dunet` | ubuntu:22.04 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ |

## License

View the [License](https://github.com/intel/intel-extension-for-pytorch/blob/main/LICENSE) for the [Intel® Extension for PyTorch*].

These images also contain other software which may be under other licenses (such as PyTorch*, Jupyter*, Bash, etc. from the base image).

It is the image user's responsibility to ensure that any use of these images complies with all relevant licenses for the software contained within.

\* Other names and brands may be claimed as the property of others.

[Intel® Arc™ A-Series Graphics]: https://ark.intel.com/content/www/us/en/ark/products/series/227957/intel-arc-a-series-graphics.html
[Intel® Data Center GPU Flex Series]: https://ark.intel.com/content/www/us/en/ark/products/series/230021/intel-data-center-gpu-flex-series.html
[Intel® Data Center GPU Max Series]: https://ark.intel.com/content/www/us/en/ark/products/series/232874/intel-data-center-gpu-max-series.html

[Intel® MPI]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html
[Intel® Extension for PyTorch*]: https://intel.github.io/intel-extension-for-pytorch/
[Intel® Distribution for Python*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html
[Intel® oneAPI Collective Communications Library]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html
[INC]: https://github.com/intel/neural-compressor
[PyTorch*]: https://pytorch.org/
[TorchServe*]: https://github.com/pytorch/serve
[DeepSpeed*]: https://github.com/microsoft/DeepSpeed

[v0.4.0-Beta]: https://github.com/intel/ai-containers/blob/main/pytorch/Dockerfile
[v0.3.4]: https://github.com/intel/ai-containers/blob/v0.3.4/pytorch/Dockerfile
[v0.2.3]: https://github.com/intel/ai-containers/blob/v0.2.3/pytorch/Dockerfile
[v0.1.0]: https://github.com/intel/ai-containers/blob/v0.1.0/pytorch/Dockerfile

[v2.1.40+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.40%2Bxpu
[v2.1.30+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.30%2Bxpu
[v2.1.20+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.20%2Bxpu
[v2.1.10+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.10%2Bxpu
[v2.0.110+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.0.110%2Bxpu

[v2.4.0]: https://github.com/pytorch/pytorch/releases/tag/v2.4.0
[v2.3.0]: https://github.com/pytorch/pytorch/releases/tag/v2.3.0
[v2.2.2]: https://github.com/pytorch/pytorch/releases/tag/v2.2.2
[v2.2.0]: https://github.com/pytorch/pytorch/releases/tag/v2.2.0
[v2.1.2]: https://github.com/pytorch/pytorch/releases/tag/v2.1.2
[v2.1.0]: https://github.com/pytorch/pytorch/releases/tag/v2.1.0
[v2.0.1]: https://github.com/pytorch/pytorch/releases/tag/v2.0.1
[v2.0.0]: https://github.com/pytorch/pytorch/releases/tag/v2.0.0

[v3.0]: https://github.com/intel/neural-compressor/releases/tag/v3.0
[v2.6]: https://github.com/intel/neural-compressor/releases/tag/v2.6
[v2.4.1]: https://github.com/intel/neural-compressor/releases/tag/v2.4.1
[v2.3.1]: https://github.com/intel/neural-compressor/releases/tag/v2.3.1
[v2.1.1]: https://github.com/intel/neural-compressor/releases/tag/v2.1.1

[v2.4.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.4.0%2Bcpu
[v2.3.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.3.0%2Bcpu
[v2.2.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.2.0%2Bcpu
[v2.1.100+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.100%2Bcpu
[v2.1.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.0%2Bcpu
[v2.0.100+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.0.100%2Bcpu
[v2.0.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.0.0%2Bcpu

[ccl-v2.4.0]: https://github.com/intel/torch-ccl/releases/tag/v2.4.0%2Bcpu%2Brc0
[ccl-v2.3.0]: https://github.com/intel/torch-ccl/releases/tag/v2.3.0%2Bcpu
[ccl-v2.2.0]: https://github.com/intel/torch-ccl/releases/tag/v2.2.0%2Bcpu
[ccl-v2.1.0]: https://github.com/intel/torch-ccl/releases/tag/v2.1.0%2Bcpu
[ccl-v2.0.0]: https://github.com/intel/torch-ccl/releases/tag/v2.0.0%2Bcpu

[v4.44.0]: https://github.com/huggingface/transformers/releases/tag/v4.44.0

[914]: https://dgpu-docs.intel.com/releases/stable_914_33_20240730.html
[803]: https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html
[736]: https://dgpu-docs.intel.com/releases/stable_736_25_20231031.html
[647]: https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html

[Inference v4.1]: https://mlcommons.org/benchmarks/inference-datacenter
[Get Started with Intel MLPerf]: https://www.intel.com/content/www/us/en/developer/articles/guide/get-started-mlperf-intel-optimized-docker-images.html
diff --git a/docs/tensorflow/README.md b/docs/tensorflow/README.md
new file mode 100644
index 000000000..ac2c8b7c9
--- /dev/null
+++ b/docs/tensorflow/README.md
@@ -0,0 +1,353 @@
# Intel® Extension for TensorFlow\*

[Intel® Extension for TensorFlow*] extends [TensorFlow*] with up-to-date feature optimizations for an extra performance boost on Intel hardware.

[Intel® Extension for TensorFlow*] is based on the TensorFlow [PluggableDevice] interface to bring Intel XPU (GPU, CPU, etc.) devices into [TensorFlow*] with flexibility for on-demand performance on the following Intel GPUs:

* [Intel® Arc™ A-Series Graphics]
* [Intel® Data Center GPU Flex Series]
* [Intel® Data Center GPU Max Series]

> **Note:** There are two dockerhub repositories (`intel/intel-extension-for-tensorflow` and `intel/intel-optimized-tensorflow`) that are routinely updated with the latest images; however, some legacy images have not been published to both repositories.

## XPU images

The images below include support for both CPU and GPU optimizations:

| Tag(s) | TensorFlow | ITEX | Driver | Dockerfile |
| ---------------------- | ----------- | -------------- | ------- | --------------- |
| `2.15.0.1-xpu`, `xpu` | [v2.15.1] | [v2.15.0.1] | [803.63]| [v0.4.0-Beta] |
| `2.15.0.0-xpu` | [v2.15.0] | [v2.15.0.0] | [803] | [v0.4.0-Beta] |
| `2.14.0.1-xpu` | [v2.14.1] | [v2.14.0.1] | [736] | [v0.3.4] |
| `2.13.0.0-xpu` | [v2.13.0] | [v2.13.0.0] | [647] | [v0.2.3] |

### Run the XPU Container

```bash
docker run -it --rm \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --ipc=host \
    intel/intel-extension-for-tensorflow:xpu
```

---

The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:

| Tag(s) | TensorFlow | ITEX | Driver | Dockerfile |
| ------------- | ----------- | ------------- | ------ | --------------- |
| `2.15.0.1-xpu-jupyter` | [v2.15.1] | [v2.15.0.1] | [803.63]| [v0.4.0-Beta] |
| `xpu-jupyter` | [v2.14.1] | [v2.14.0.1] | [736] | [v0.3.4] |

### Run the XPU Jupyter Container

```bash
docker run -it --rm \
    -p 8888:8888 \
    --net=host \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --ipc=host \
    intel/intel-extension-for-tensorflow:2.15.0.1-xpu-jupyter
```

After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.
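Once inside the XPU containers, a short script can confirm that ITEX has registered the Intel GPU as a TensorFlow `XPU` device. A minimal sketch (tensor shapes are arbitrary):

```python
import tensorflow as tf
import intel_extension_for_tensorflow as itex

print("ITEX version:", itex.__version__)
xpus = tf.config.list_physical_devices("XPU")
print("XPU devices:", xpus)

if xpus:
    # Place a small matrix multiply on the first Intel GPU.
    with tf.device("/XPU:0"):
        x = tf.random.normal((4, 4))
        print(tf.matmul(x, x).device)
```

An empty device list usually means the container was started without `--device /dev/dri`, or the host driver does not match the table above.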
+
+---
+
+The images below are [TensorFlow* Serving] with GPU Optimizations:
+
+| Tag(s) | TensorFlow | ITEX |
+| ------------------------------------- | ----------- | ------------ |
+| `2.14.0.1-serving-gpu`, `serving-gpu` | [v2.14.1] | [v2.14.0.1] |
+| `2.13.0.0-serving-gpu` | [v2.13.0] | [v2.13.0.0] |
+
+### Run the Serving GPU Container
+
+```bash
+docker run -it --rm \
+    -p 8500:8500 \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    -v $PWD/workspace:/workspace \
+    -w /workspace \
+    -e MODEL_NAME=<your-model-name> \
+    -e MODEL_DIR=<your-model-dir> \
+    intel/intel-extension-for-tensorflow:serving-gpu
+```
+
+For more details, follow the procedure in the [Intel® Extension for TensorFlow* Serving] instructions.
+
+## CPU only images
+
+The images below are built only with CPU optimizations (GPU acceleration support was deliberately excluded):
+
+| Tag(s) | TensorFlow | ITEX | Dockerfile |
+| --------------------------- | ----------- | ------------ | --------------- |
+| `2.15.1-pip-base`, `latest` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
+| `2.15.0-pip-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
+| `2.14.0-pip-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
+| `2.13-pip-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |
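+
+As a quick sanity check, any of the base images above can report its TensorFlow version (a minimal sketch using the `latest` tag):
+
+```bash
+# Print the TensorFlow version bundled in the CPU base image
+docker run -it --rm intel/intel-optimized-tensorflow:latest \
+    python -c 'import tensorflow as tf; print(tf.__version__)'
+```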
+
+The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:
+
+| Tag(s) | TensorFlow | ITEX | Dockerfile |
+| -------------------- | ----------- | ------------- | --------------- |
+| `2.15.1-pip-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
+| `2.15.0-pip-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
+| `2.14.0-pip-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
+| `2.13-pip-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |
+
+### Run the CPU Jupyter Container
+
+```bash
+docker run -it --rm \
+    -p 8888:8888 \
+    --net=host \
+    -v $PWD/workspace:/workspace \
+    -w /workspace \
+    intel/intel-extension-for-tensorflow:2.15.1-pip-jupyter
+```
+
+After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.
+
+---
+
+The images below additionally include [Horovod]:
+
+| Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile |
+| ------------------------------ | --------- | ------------ | --------- | --------------- |
+| `2.15.1-pip-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] |
+| `2.15.0-pip-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] |
+| `2.14.0-pip-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] |
+| `2.13-pip-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] |
+
+> [!NOTE]
+> Passwordless SSH connection is also enabled in the image, but the container does not contain any SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`.
+
+> [!TIP]
+> Before mounting any keys, modify the permissions of those files with `chmod 600 authorized_keys; chmod 600 id_rsa` to grant read access for the default user account.
+
+#### Setup and Run ITEX Multi-Node Container
+
+Some additional assembly is required to utilize this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively:
+
+SSH Server (Worker)
+
+1. *Authorized Keys* : `/etc/ssh/authorized_keys`
+
+SSH Client (Launcher)
+
+1. *Private User Key* : `/root/.ssh/id_rsa`
+
+To add these files correctly, please follow the steps described below.
+
+1. Setup ID Keys
+
+   You can use the commands provided below to [generate the identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH.
+
+   ```bash
+   ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa
+   touch authorized_keys
+   cat id_rsa.pub >> authorized_keys
+   ```
+
+2. Configure the permissions and ownership for all of the files you have created so far
+
+   ```bash
+   chmod 600 id_rsa config authorized_keys
+   chown root:root id_rsa.pub id_rsa config authorized_keys
+   ```
+
+3. Create a hostfile for horovod. (Optional)
+
+   ```txt
+   Host host1
+       HostName <ip-or-hostname-of-host1>
+       IdentitiesOnly yes
+       IdentityFile ~/.ssh/id_rsa
+       Port <ssh-port>
+   Host host2
+       HostName <ip-or-hostname-of-host2>
+       IdentitiesOnly yes
+       IdentityFile ~/.ssh/id_rsa
+       Port <ssh-port>
+   ...
+   ```
+
+4. Configure [Horovod] in your python script
+
+   ```python
+   import horovod.tensorflow as hvd
+
+   hvd.init()
+   ```
+
+5. Now start the workers and execute DDP on the launcher
+
+   1. Worker run command:
+
+      ```bash
+      docker run -it --rm \
+          --net=host \
+          -v $PWD/authorized_keys:/etc/ssh/authorized_keys \
+          -v $PWD/tests:/workspace/tests \
+          -w /workspace \
+          intel/intel-optimized-tensorflow:2.15.1-pip-multinode \
+          bash -c '/usr/sbin/sshd -D'
+      ```
+
+   2. Launcher run command:
+
+      ```bash
+      docker run -it --rm \
+          --net=host \
+          -v $PWD/id_rsa:/root/.ssh/id_rsa \
+          -v $PWD/tests:/workspace/tests \
+          -v $PWD/hostfile:/root/.ssh/config \
+          -w /workspace \
+          intel/intel-optimized-tensorflow:2.15.1-pip-multinode \
+          bash -c 'horovodrun --verbose -np 2 -H host1:1,host2:1 python /workspace/tests/tf_base_test.py'
+      ```
+
+> [!NOTE]
+> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the [Intel® MPI] documentation for how to configure it for your network.
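+
+Before launching a distributed job, you can verify the Horovod build and the MPI toolchain inside the image with the same check this project's compose file uses:
+
+```bash
+# Confirm Horovod was built with TensorFlow and MPI support
+docker run --rm intel/intel-optimized-tensorflow:2.15.1-pip-multinode \
+    bash -c 'horovodrun --check-build && mpirun --version'
+```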
+
+---
+
+The images below are [TensorFlow* Serving] with CPU Optimizations:
+
+| Tag(s) | TensorFlow | ITEX |
+| ------------------------------------- | ---------- | ------------ |
+| `2.14.0.1-serving-cpu`, `serving-cpu` | [v2.14.1] | [v2.14.0.1] |
+| `2.13.0.0-serving-cpu` | [v2.13.0] | [v2.13.0.0] |
+
+### Run the Serving CPU Container
+
+```bash
+docker run -it --rm \
+    -p 8500:8500 \
+    -v $PWD/workspace:/workspace \
+    -w /workspace \
+    -e MODEL_NAME=<your-model-name> \
+    -e MODEL_DIR=<your-model-dir> \
+    intel/intel-extension-for-tensorflow:serving-cpu
+```
+
+For more details, follow the procedure in the [Intel® Extension for TensorFlow* Serving] instructions.
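+
+Once the server is up and a model is loaded, its status can be queried over the TensorFlow Serving REST API (a sketch; it assumes the REST port is also published with `-p 8501:8501` and uses `my_model` as an illustrative model name):
+
+```bash
+# Query the model-status endpoint of TensorFlow Serving
+curl http://localhost:8501/v1/models/my_model
+```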
+
+## CPU only images with Intel® Distribution for Python*
+
+The images below are built only with CPU optimizations (GPU acceleration support was deliberately excluded) and include [Intel® Distribution for Python*]:
+
+| Tag(s) | TensorFlow | ITEX | Dockerfile |
+| --------------------------- | ----------- | ------------ | --------------- |
+| `2.15.1-idp-base` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
+| `2.15.0-idp-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
+| `2.14.0-idp-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
+| `2.13-idp-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |
+
+The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:
+
+| Tag(s) | TensorFlow | ITEX | Dockerfile |
+| -------------------- | ----------- | ------------- | --------------- |
+| `2.15.1-idp-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
+| `2.15.0-idp-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
+| `2.14.0-idp-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
+| `2.13-idp-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |
+
+The images below additionally include [Horovod]:
+
+| Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile |
+| ------------------------------ | --------- | ------------ | --------- | --------------- |
+| `2.15.1-idp-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] |
+| `2.15.0-idp-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] |
+| `2.14.0-idp-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] |
+| `2.13-idp-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] |
+
+## XPU images with Intel® Distribution for Python*
+
+The images below are built with both CPU and GPU optimizations and include [Intel® Distribution for Python*]:
+
+| Tag(s) | TensorFlow | ITEX | Driver | Dockerfile |
+| ---------------- | -------- | ------------ | -------- | ------ |
+| `2.15.0.1-xpu-idp-base` | [v2.15.1] | [v2.15.0.1] | [803] | [v0.4.0-Beta] |
+| `2.15.0-xpu-idp-base` | [v2.15.0] | [v2.15.0.0] | [803] | [v0.4.0-Beta] |
+
+The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:
+
+| Tag(s) | TensorFlow | ITEX | Driver | Jupyter Port | Dockerfile |
+| --------------------- | -------- | ------------- | ------ | ------------ | --------------- |
+| `2.15.0.1-xpu-idp-jupyter` | [v2.15.1] | [v2.15.0.1] | [803] | `8888` | [v0.4.0-Beta] |
+| `2.15.0-xpu-idp-jupyter` | [v2.15.0] | [v2.15.0.0] | [803] | `8888` | [v0.4.0-Beta] |
+
+## Build from Source
+
+To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to set up your environment, and run the following command:
+
+```bash
+cd tensorflow
+docker compose build tf-base
+docker compose run tf-base
+```
+
+You can find the list of services below for each container in the group:
+
+| Service Name | Description |
+| ------------- | ------------------------------------------------------------------- |
+| `tf-base` | Base image with [Intel® Extension for TensorFlow*] |
+| `jupyter` | Adds Jupyter Notebook server |
+| `multinode` | Adds [Intel® MPI], [Horovod] and [INC] |
+| `xpu` | Adds Intel GPU Support |
+| `xpu-jupyter` | Adds Jupyter notebook server to GPU image |
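+
+The compose file reads its build arguments from environment variables, so the IDP variants described above can be built with the same services by overriding `PACKAGE_OPTION` (a sketch, mirroring the pattern this repository uses for its other container groups):
+
+```bash
+# Build the base image against Intel® Distribution for Python* instead of pip
+cd tensorflow
+PACKAGE_OPTION=idp docker compose build tf-base
+```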
+
+## License
+
+View the [License](https://github.com/intel/intel-extension-for-tensorflow/tree/main?tab=License-1-ov-file#readme) for the [Intel® Extension for TensorFlow*].
+
+These images also contain other software which may be under other licenses (such as TensorFlow*, Jupyter*, Bash, etc. from the base).
+
+It is the image user's responsibility to ensure that any use of these images complies with any relevant licenses for all software contained within.
+
+\* Other names and brands may be claimed as the property of others.
+
+
+
+[Intel® Arc™ A-Series Graphics]: https://ark.intel.com/content/www/us/en/ark/products/series/227957/intel-arc-a-series-graphics.html
+[Intel® Data Center GPU Flex Series]: https://ark.intel.com/content/www/us/en/ark/products/series/230021/intel-data-center-gpu-flex-series.html
+[Intel® Data Center GPU Max Series]: https://ark.intel.com/content/www/us/en/ark/products/series/232874/intel-data-center-gpu-max-series.html
+
+[Intel® Extension for TensorFlow*]: https://github.com/intel/intel-extension-for-tensorflow
+[Intel® Extension for TensorFlow* Serving]: https://intel.github.io/intel-extension-for-tensorflow/latest/docker/tensorflow-serving/README.html
+[Intel® Distribution for Python*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html
+[INC]: https://github.com/intel/neural-compressor
+[TensorFlow*]: https://github.com/tensorflow/tensorflow
+[PluggableDevice]: https://github.com/tensorflow/community/blob/master/rfcs/20200624-pluggable-device-for-tensorflow.md
+[TensorFlow* Serving]: https://github.com/tensorflow/serving
+[Horovod]: https://github.com/horovod/horovod
+[Intel® MPI]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html#gs.9bna9o
+
+[v0.4.0-Beta]: https://github.com/intel/ai-containers/blob/v0.4.0-Beta/tensorflow/Dockerfile
+[v0.3.4]: https://github.com/intel/ai-containers/blob/v0.3.4/tensorflow/Dockerfile
+[v0.2.3]: https://github.com/intel/ai-containers/blob/v0.2.3/tensorflow/Dockerfile
+
+[v2.15.1]: https://github.com/tensorflow/tensorflow/releases/tag/v2.15.1
+[v2.15.0]: https://github.com/tensorflow/tensorflow/releases/tag/v2.15.0
+[v2.14.1]: https://github.com/tensorflow/tensorflow/releases/tag/v2.14.1
+[v2.13.0]: https://github.com/tensorflow/tensorflow/releases/tag/v2.13.0
+
+[v2.15.0.1]: https://github.com/intel/intel-extension-for-tensorflow/releases/tag/v2.15.0.1
+[v2.15.0.0]: https://github.com/intel/intel-extension-for-tensorflow/releases/tag/v2.15.0.0
+[v2.14.0.1]: https://github.com/intel/intel-extension-for-tensorflow/releases/tag/v2.14.0.1
+[v2.13.0.0]: https://github.com/intel/intel-extension-for-tensorflow/releases/tag/v2.13.0.0
+
+[v0.28.1]: https://github.com/horovod/horovod/releases/tag/v0.28.1
+[v0.28.0]: https://github.com/horovod/horovod/releases/tag/v0.28.0
+
+[803.63]: https://dgpu-docs.intel.com/releases/LTS_803.63_20240617.html
+[803]: https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html
+[736]: https://dgpu-docs.intel.com/releases/stable_736_25_20231031.html
+[647]: https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html
diff --git a/docs/workflows/README.md b/docs/workflows/README.md
new file mode 100644
index 000000000..21269eb7d
--- /dev/null
+++ b/docs/workflows/README.md
@@ -0,0 +1,36 @@
+# Intel® AI Workflows
+
+Demonstrations showing how the [Intel® AI Containers] can be used for different use cases:
+
+## PyTorch Workflows
+
+| Base Container | Device Type | Example | Description |
+|----------------|-------------|---------|-------------|
+| `intel/intel-optimized-pytorch:2.3.0-pip-multinode` | CPU | [Distributed LLM Fine Tuning with Kubernetes] | Demonstrates using Hugging Face Transformers with Intel® Xeon® Scalable Processors to fine-tune LLMs with multiple nodes from a Kubernetes cluster. The example includes an LLM fine-tuning script, Dockerfile, and Helm chart. |
+| `intel/intel-optimized-pytorch:2.3.0-serving-cpu` | CPU | [TorchServe* with Kubernetes] | Demonstrates using TorchServe* with Intel® Xeon® Scalable Processors to serve models on multiple nodes from a Kubernetes cluster. The example includes a Helm chart. |
+
+## Build from Source
+
+To build the images from source, clone the [Intel® AI Containers] repository, follow the main `README.md` file to set up your environment, and run the following command:
+
+```bash
+cd workflows/charts/huggingface-llm
+docker compose build huggingface-llm
+docker compose run huggingface-llm sh -c "python /workspace/scripts/finetune.py --help"
+```
+
+## License
+
+View the [License](https://github.com/intel/ai-containers/blob/main/LICENSE) for the [Intel® AI Containers].
+
+These images also contain other software which may be under other licenses (such as PyTorch*, Jupyter*, Bash, etc. from the base).
+
+It is the image user's responsibility to ensure that any use of these images complies with any relevant licenses for all software contained within.
+
+\* Other names and brands may be claimed as the property of others.
+
+
+
+[Intel® AI Containers]: https://github.com/intel/ai-containers
+[Distributed LLM Fine Tuning with Kubernetes]: https://github.com/intel/ai-containers/tree/main/workflows/charts/huggingface-llm
+[TorchServe* with Kubernetes]: https://github.com/intel/ai-containers/tree/main/workflows/charts/torchserve
diff --git a/tensorflow/Dockerfile b/tensorflow/Dockerfile
index a54b5466b..4f3d0f6ee 100644
--- a/tensorflow/Dockerfile
+++ b/tensorflow/Dockerfile
@@ -77,37 +77,43 @@ EXPOSE 8888
 CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/jupyter --port 8888 --ip 0.0.0.0 --no-browser --allow-root --ServerApp.token= --ServerApp.password= --ServerApp.allow_origin=* --ServerApp.base_url=$NB_PREFIX"]
 
-FROM tf-base-${PACKAGE_OPTION} AS openmpi
+FROM tf-base-${PACKAGE_OPTION} AS multinode
 
 RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    build-essential \
+    cmake \
+    g++ \
+    gcc \
+    git \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
     libopenmpi-dev \
+    numactl \
     openmpi-bin \
-    openmpi-common
+    openmpi-common \
+    python3-dev \
+    unzip \
+    virtualenv
+
+ENV SIGOPT_PROJECT=.
 
 WORKDIR /
-COPY ompi-requirements.txt .
+COPY multinode/requirements.txt requirements.txt
 
-RUN python -m pip install --no-cache-dir -r ompi-requirements.txt
+RUN python -m pip install --no-cache-dir -r requirements.txt && \
+    rm -rf requirements.txt
 
-FROM openmpi AS horovod
-
-ENV LD_LIBRARY_PATH /lib64/:/usr/lib64/:/usr/local/lib64
+ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
 
 RUN apt-get install -y --no-install-recommends --fix-missing \
-    unzip \
     openssh-client \
     openssh-server && \
-    rm /etc/ssh/ssh_host_*_key \
-    /etc/ssh/ssh_host_*_key.pub
-
-ENV OMPI_ALLOW_RUN_AS_ROOT=1
-ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
-
-ENV OMPI_MCA_tl_tcp_if_exclude="lo,docker0"
+    rm /etc/ssh/ssh_host_*_key \
+    /etc/ssh/ssh_host_*_key.pub && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
-# Install OpenSSH for MPI to communicate between containers
-RUN mkdir -p /var/run/sshd && \
-    echo 'LoginGraceTime 0' >> /etc/ssh/sshd_config
+RUN mkdir -p /var/run/sshd
 
 # Install Horovod
 ARG HOROVOD_WITH_TENSORFLOW=1
@@ -116,43 +122,32 @@ ARG HOROVOD_WITHOUT_PYTORCH=1
 ARG HOROVOD_WITHOUT_GLOO=1
 ARG HOROVOD_WITH_MPI=1
 
-RUN apt-get install -y --no-install-recommends --fix-missing \
-    build-essential \
-    cmake \
-    g++ \
-    gcc \
-    git \
-    libgl1-mesa-glx \
-    libglib2.0-0 \
-    python3-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /
-COPY hvd-requirements.txt .
-
-RUN python -m pip install --no-cache-dir -r hvd-requirements.txt
-
-ENV SIGOPT_PROJECT=.
-
-RUN wget --progress=dot:giga --no-check-certificate https://github.com/intel/neural-compressor/raw/master/docker/third-party-programs-tensorflow.txt -O /licenses/inc-third-party-programs-tensorflow.txt && \
-    wget --progress=dot:giga --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licenses/INC_LICENSE
+ENV LD_LIBRARY_PATH /lib64/:/usr/lib64/:/usr/local/lib64
 
-FROM horovod AS multinode-pip
+RUN pip install --no-cache-dir horovod==0.28.1
 
-WORKDIR /
-COPY multinode-requirements.txt .
+ARG PYTHON_VERSION
 
-RUN python -m pip install --no-cache-dir -r multinode-requirements.txt
+COPY multinode/generate_ssh_keys.sh /generate_ssh_keys.sh
 
-FROM horovod AS multinode-idp
+# Turn generate_ssh_keys.sh into a startup helper: append its contents to
+# ~/.startup, which the entrypoint sources when the container starts, then
+# remove the original script so it is not executed a second time
+ARG PACKAGE_OPTION=pip
+ARG PYPATH="/usr/local/lib/python${PYTHON_VERSION}/dist-packages"
+RUN if [ "${PACKAGE_OPTION}" = "idp" ]; then PYPATH="/opt/conda/envs/idp/lib/python${PYTHON_VERSION}/site-packages"; fi && \
+    echo "source ${PYPATH}/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \
+    cat '/generate_ssh_keys.sh' >> ~/.startup && \
+    rm -rf /generate_ssh_keys.sh
 
-WORKDIR /
-COPY multinode-requirements.txt .
+COPY multinode/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh +COPY multinode/sshd_config /etc/ssh/sshd_config +COPY multinode/ssh_config /etc/ssh/ssh_config -RUN python -m pip install --no-cache-dir -r multinode-requirements.txt +RUN wget --progress=dot:giga --no-check-certificate https://github.com/intel/neural-compressor/raw/master/docker/third-party-programs-tensorflow.txt -O /licenses/inc-third-party-programs-tensorflow.txt && \ + wget --progress=dot:giga --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licenses/INC_LICENSE -FROM ${PYTHON_BASE} AS itex-xpu-base-pip +FROM ${PYTHON_BASE} AS itex-xpu-base RUN apt-get update && \ apt-get install -y --no-install-recommends --fix-missing \ @@ -219,54 +214,7 @@ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/maste ENV LD_LIBRARY_PATH=/opt/intel/oneapi/redist/lib:$LD_LIBRARY_PATH -FROM ${PYTHON_BASE} AS itex-xpu-base-idp - -RUN apt-get update && \ - apt-get install -y --no-install-recommends --fix-missing \ - apt-utils \ - build-essential \ - clinfo \ - git \ - gnupg2 \ - gpg-agent \ - rsync \ - unzip \ - wget && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ARG ICD_VER -ARG LEVEL_ZERO_GPU_VER -ARG LEVEL_ZERO_VER -ARG LEVEL_ZERO_DEV_VER - -RUN no_proxy="" NO_PROXY="" wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ - gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg -RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" | \ - tee /etc/apt/sources.list.d/intel-gpu-jammy.list - -RUN no_proxy="" NO_PROXY="" apt-get update && \ - apt-get install -y --no-install-recommends --fix-missing \ - intel-opencl-icd=${ICD_VER} \ - intel-level-zero-gpu=${LEVEL_ZERO_GPU_VER} \ - level-zero=${LEVEL_ZERO_VER} \ - level-zero-dev=${LEVEL_ZERO_DEV_VER} && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ARG ITEX_VER="2.15.0.1" - -RUN conda install -n idp -y intel-extension-for-tensorflow=${ITEX_VER}=*xpu* \ - -c https://software.repos.intel.com/python/conda - -ENV LD_LIBRARY_PATH=/opt/conda/envs/idp/lib:$LD_LIBRARY_PATH - -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/ -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/ -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-programs-of-intel-tensorflow.txt /licenses/ -ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-programs-of-intel-optimization-for-horovod.txt /licenses/ - -FROM itex-xpu-base-${PACKAGE_OPTION} AS itex-xpu-jupyter +FROM itex-xpu-base AS itex-xpu-jupyter WORKDIR /jupyter COPY jupyter-requirements.txt . 
diff --git a/tensorflow/README.md b/tensorflow/README.md index 195cebdf3..ac2c8b7c9 100644 --- a/tensorflow/README.md +++ b/tensorflow/README.md @@ -85,7 +85,8 @@ The images below are built only with CPU optimizations (GPU acceleration support | Tag(s) | TensorFlow | ITEX | Dockerfile | | --------------------------- | ----------- | ------------ | --------------- | -| `2.15.0-pip-base`, `latest` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | +| `2.15.1-pip-base`, `latest` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | +| `2.15.0-pip-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-pip-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-pip-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -93,6 +94,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s | Tag(s) | TensorFlow | ITEX | Dockerfile | | -------------------- | ----------- | ------------- | --------------- | +| `2.15.1-pip-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | | `2.15.0-pip-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | | `2.14.0-pip-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | | `2.13-pip-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | @@ -105,7 +107,7 @@ docker run -it --rm \ --net=host \ -v $PWD/workspace:/workspace \ -w /workspace \ - intel/intel-extension-for-tensorflow:2.15.0-pip-jupyter + intel/intel-extension-for-tensorflow:2.15.1-pip-jupyter ``` After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. @@ -116,10 +118,102 @@ The images below additionally include [Horovod]: | Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile | | ------------------------------ | --------- | ------------ | --------- | --------------- | +| `2.15.1-pip-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] | | `2.15.0-pip-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] | | `2.14.0-pip-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] | | `2.13-pip-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] | +> [!NOTE] +> Passwordless SSH connection is also enabled in the image, but the container does not contain any SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`. + +> [!TIP] +> Before mounting any keys, modify the permissions of those files with `chmod 600 authorized_keys; chmod 600 id_rsa` to grant read access for the default user account. + +#### Setup and Run ITEX Multi-Node Container + +Some additional assembly is required to utilize this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively: + +SSH Server (Worker) + +1. *Authorized Keys* : `/etc/ssh/authorized_keys` + +SSH Client (Launcher) + +1. *Private User Key* : `/root/.ssh/id_rsa` + +To add these files correctly please follow the steps described below. + +1. Setup ID Keys + + You can use the commands provided below to [generate the identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH. + + ```bash + ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa + touch authorized_keys + cat id_rsa.pub >> authorized_keys + ``` + +2. Configure the permissions and ownership for all of the files you have created so far + + ```bash + chmod 600 id_rsa config authorized_keys + chown root:root id_rsa.pub id_rsa config authorized_keys + ``` + +3. Create a hostfile for horovod. 
(Optional)
+
+   ```txt
+   Host host1
+       HostName <ip-or-hostname-of-host1>
+       IdentitiesOnly yes
+       IdentityFile ~/.ssh/id_rsa
+       Port <ssh-port>
+   Host host2
+       HostName <ip-or-hostname-of-host2>
+       IdentitiesOnly yes
+       IdentityFile ~/.ssh/id_rsa
+       Port <ssh-port>
+   ...
+   ```
+
+4. Configure [Horovod] in your python script
+
+   ```python
+   import horovod.tensorflow as hvd
+
+   hvd.init()
+   ```
+
+5. Now start the workers and execute DDP on the launcher
+
+   1. Worker run command:
+
+      ```bash
+      docker run -it --rm \
+          --net=host \
+          -v $PWD/authorized_keys:/etc/ssh/authorized_keys \
+          -v $PWD/tests:/workspace/tests \
+          -w /workspace \
+          intel/intel-optimized-tensorflow:2.15.1-pip-multinode \
+          bash -c '/usr/sbin/sshd -D'
+      ```
+
+   2. Launcher run command:
+
+      ```bash
+      docker run -it --rm \
+          --net=host \
+          -v $PWD/id_rsa:/root/.ssh/id_rsa \
+          -v $PWD/tests:/workspace/tests \
+          -v $PWD/hostfile:/root/.ssh/config \
+          -w /workspace \
+          intel/intel-optimized-tensorflow:2.15.1-pip-multinode \
+          bash -c 'horovodrun --verbose -np 2 -H host1:1,host2:1 python /workspace/tests/tf_base_test.py'
+      ```
+
+> [!NOTE]
+> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the [Intel® MPI] documentation for how to configure it for your network.
+
 ---
 
 The images below are [TensorFlow* Serving] with CPU Optimizations:
@@ -151,7 +245,8 @@ The images below are built only with CPU optimizations (GPU acceleration support
 
 | Tag(s) | TensorFlow | ITEX | Dockerfile |
 | --------------------------- | ----------- | ------------ | --------------- |
-| `2.15.0-idp-base`, `latest` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
+| `2.15.1-idp-base` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
+| `2.15.0-idp-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
 | `2.14.0-idp-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
 | `2.13-idp-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |
 
@@ -159,6 +254,7 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s
 
 | Tag(s) | TensorFlow | ITEX | Dockerfile |
 | -------------------- | ----------- | ------------- | --------------- |
+| `2.15.1-idp-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] |
 | `2.15.0-idp-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] |
 | `2.14.0-idp-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] |
 | `2.13-idp-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] |
@@ -167,6 +263,7 @@ The images below additionally include [Horovod]:
 
 | Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile |
 | ------------------------------ | --------- | ------------ | --------- | --------------- |
+| `2.15.1-idp-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] |
 | `2.15.0-idp-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] |
 | `2.14.0-idp-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] |
 | `2.13-idp-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] |
diff --git a/tensorflow/docker-compose.yaml b/tensorflow/docker-compose.yaml
index 9583b2966..957ae0063 100644
--- a/tensorflow/docker-compose.yaml
+++ b/tensorflow/docker-compose.yaml
@@ -69,21 +69,27 @@ services:
   multinode:
     build:
       labels:
+        dependency.apt.build-essential: true
+        dependency.apt.cmake: true
         dependency.apt.gcc: true
+        dependency.apt.g++: true
+        dependency.apt.git: true
         dependency.apt.libgl1-mesa-glx: true
         dependency.apt.libglib2: true
-        dependency.apt.python3-dev: true
-        dependency.pip.apt.virtualenv: true
         dependency.apt.libopenmpi-dev: true
+        dependency.apt.numactl: true
         dependency.apt.openmpi-bin: true
-        dependency.apt.unzip: true
         dependency.apt.openssh-client: true
dependency.apt.openssh-server: true - dependency.python.pip: multinode-requirements.txt + dependency.apt.python3-dev: true + dependency.apt.unzip: true + dependency.pip.apt.virtualenv: true + dependency.pip.horovod: 0.28.1 + dependency.python.pip: multinode/requirements.txt org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow MultiNode Image" org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-multinode - target: multinode-${PACKAGE_OPTION:-pip} + target: multinode command: > bash -c "horovodrun --check-build && mpirun --version && python -c 'import horovod.tensorflow as hvd;hvd.init();import horovod.tensorflow;import neural_compressor, tf2onnx; print(\"\\nNeural Compressor Version:\", neural_compressor.__version__, \"\\\nTensorFlow2ONNX Version:\", tf2onnx.__version__)'" extends: tf-base @@ -120,7 +126,7 @@ services: org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.title: "Intel® Extension for TensorFlow XPU Base Image" org.opencontainers.image.version: ${TF_VER:-2.15.0}-xpu-${PACKAGE_OPTION:-pip}-base - target: itex-xpu-base-${PACKAGE_OPTION:-pip} + target: itex-xpu-base command: > sh -c "python -c 'import tensorflow as tf;print(tf.__version__);from tensorflow.python.client import device_lib;print(device_lib.list_local_devices())'" extends: tf-base diff --git a/tensorflow/hvd-requirements.txt b/tensorflow/hvd-requirements.txt deleted file mode 100644 index f2eadccea..000000000 --- a/tensorflow/hvd-requirements.txt +++ /dev/null @@ -1 +0,0 @@ -horovod==0.28.1 diff --git a/tensorflow/jupyter-requirements.txt b/tensorflow/jupyter-requirements.txt index e95ad6e83..9bdbed92a 100644 --- a/tensorflow/jupyter-requirements.txt +++ b/tensorflow/jupyter-requirements.txt @@ -1,4 +1,4 @@ -jupyterlab==4.3.0b0 +jupyterlab>=4.2.4 jupyterhub==5.1.0 -notebook==7.3.0a1 +notebook>=7.1.3 jupyter-server-proxy>=4.1.2 diff --git a/tensorflow/multinode-requirements.txt b/tensorflow/multinode-requirements.txt deleted file mode 100644 index f5381669b..000000000 --- a/tensorflow/multinode-requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -cython -tf2onnx -neural-compressor==3.0 diff --git a/tensorflow/multinode/dockerd-entrypoint.sh b/tensorflow/multinode/dockerd-entrypoint.sh new file mode 100755 index 000000000..ba13c0f94 --- /dev/null +++ b/tensorflow/multinode/dockerd-entrypoint.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e +set -a +# shellcheck disable=SC1091 +source "$HOME/.startup" +set +a +"$@" diff --git a/tensorflow/multinode/generate_ssh_keys.sh b/tensorflow/multinode/generate_ssh_keys.sh new file mode 100755 index 000000000..0ee61398e --- /dev/null +++ b/tensorflow/multinode/generate_ssh_keys.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +function gen_single_key() { + ALG_NAME=$1 + if [[ ! -f /etc/ssh/ssh_host_${ALG_NAME}_key ]]; then + ssh-keygen -q -N "" -t "${ALG_NAME}" -f "/etc/ssh/ssh_host_${ALG_NAME}_key" + fi +} + +gen_single_key dsa +gen_single_key rsa +gen_single_key ecdsa +gen_single_key ed25519 diff --git a/tensorflow/multinode/requirements.txt b/tensorflow/multinode/requirements.txt new file mode 100644 index 000000000..807477402 --- /dev/null +++ b/tensorflow/multinode/requirements.txt @@ -0,0 +1,5 @@ +cython>=3.0.11 +impi-rt>=2021.12.0 +mpi4py>=3.1.0 +neural-compressor==3.0 +tf2onnx>=1.16.1 diff --git a/tensorflow/multinode/ssh_config b/tensorflow/multinode/ssh_config new file mode 100644 index 000000000..9ac730173 --- /dev/null +++ b/tensorflow/multinode/ssh_config @@ -0,0 +1,4 @@ +Host * + Port 3022 + IdentityFile ~/.ssh/id_rsa + StrictHostKeyChecking no diff --git a/tensorflow/multinode/sshd_config b/tensorflow/multinode/sshd_config new file mode 100644 index 000000000..4796a48af --- /dev/null +++ b/tensorflow/multinode/sshd_config @@ -0,0 +1,12 @@ +HostKey /etc/ssh/ssh_host_dsa_key +HostKey /etc/ssh/ssh_host_rsa_key +HostKey /etc/ssh/ssh_host_ecdsa_key +HostKey /etc/ssh/ssh_host_ed25519_key +AuthorizedKeysFile /etc/ssh/authorized_keys +## Enable DEBUG log. 
You can ignore this but this may help you debug any issue while enabling SSHD for the first time +LogLevel DEBUG3 +Port 3022 +UsePAM yes +Subsystem sftp /usr/lib/openssh/sftp-server +# https://ubuntu.com/security/CVE-2024-6387 +LoginGraceTime 0 diff --git a/tensorflow/ompi-requirements.txt b/tensorflow/ompi-requirements.txt deleted file mode 100644 index 7b64c1667..000000000 --- a/tensorflow/ompi-requirements.txt +++ /dev/null @@ -1 +0,0 @@ -impi-rt>=2021.12.0 diff --git a/tensorflow/requirements.txt b/tensorflow/requirements.txt index 4d1838312..92fd1059d 100644 --- a/tensorflow/requirements.txt +++ b/tensorflow/requirements.txt @@ -1,4 +1,4 @@ -tensorflow==2.17.0 -intel-extension-for-tensorflow[cpu]==2.15.0.1 +tensorflow==2.15.1 +intel-extension-for-tensorflow[cpu]>=2.15,<2.16 tensorflow-hub==0.16.1 pillow==10.4.0 diff --git a/tensorflow/xpu-requirements.txt b/tensorflow/xpu-requirements.txt index 2c7e3f7a9..9e4bb523d 100644 --- a/tensorflow/xpu-requirements.txt +++ b/tensorflow/xpu-requirements.txt @@ -1,2 +1,2 @@ -tensorflow==2.17.0 +tensorflow==2.15.0 intel-extension-for-tensorflow[xpu]==2.15.0.1 From cd88ad399096eb61a5c0a254d1d036ed7b59c20c Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Fri, 16 Aug 2024 16:14:35 -0700 Subject: [PATCH 3/4] update compose version Signed-off-by: tylertitsworth --- docs/classical-ml/README.md | 117 --------- docs/index.md | 173 ------------- docs/preset/README.md | 167 ------------- docs/python/README.md | 44 ---- docs/pytorch/README.md | 434 --------------------------------- docs/tensorflow/README.md | 353 --------------------------- docs/workflows/README.md | 36 --- tensorflow/docker-compose.yaml | 21 +- tensorflow/tests/tests.yaml | 14 +- 9 files changed, 17 insertions(+), 1342 deletions(-) delete mode 100644 docs/classical-ml/README.md delete mode 100644 docs/index.md delete mode 100644 docs/preset/README.md delete mode 100644 docs/python/README.md delete mode 100644 docs/pytorch/README.md delete mode 100644 docs/tensorflow/README.md delete mode 100644 docs/workflows/README.md diff --git a/docs/classical-ml/README.md b/docs/classical-ml/README.md deleted file mode 100644 index 9d63355c4..000000000 --- a/docs/classical-ml/README.md +++ /dev/null @@ -1,117 +0,0 @@ -# Intel® Optimized ML - -[Intel® Extension for Scikit-learn*] enhances the performance of [Scikit-learn*] by accelerating the training and inference of machine learning models on Intel® hardware. - -[XGBoost*] is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. - -## Images - -The images below include [Intel® Extension for Scikit-learn*] and [XGBoost*]. 
- -| Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile | -| ------------------------------------------------- | -------------- | ------------ | -------- | --------------- | -| `2024.6.0-pip-base`, `latest` | [v2024.6.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | -| `2024.5.0-pip-base` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | -| `2024.3.0-pip-base` | [v2024.3.0] | [v1.4.2] | [v2.0.3] | [v0.4.0-Beta] | -| `2024.2.0-xgboost-2.0.3-pip-base` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] | -| `scikit-learning-2024.0.0-xgboost-2.0.2-pip-base` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] | - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile | -| ---------------------------------------------------- | -------------- | ------------ | -------- | --------------- | -| `2024.6.0-pip-jupyter` | [v2024.6.0] | [v1.5.1] | [v2.1.1] | [v0.4.0] | -| `2024.5.0-pip-jupyter` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | -| `2024.3.0-pip-jupyter` | [v2024.3.0] | [v1.4.2] | [v2.0.3] | [v0.4.0-Beta] | -| `2024.2.0-xgboost-2.0.3-pip-jupyter` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] | -| `scikit-learning-2024.0.0-xgboost-2.0.2-pip-jupyter` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] | - -### Run the Jupyter Container - -```bash -docker run -it --rm \ - -p 8888:8888 \ - --net=host \ - -v $PWD/workspace:/workspace \ - -w /workspace \ - intel/intel-optimized-ml:2024.2.0-xgboost-2.0.3-pip-jupyter -``` - -After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. - -## Images with Intel® Distribution for Python* - -The images below include [Intel® Distribution for Python*]: - -| Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile | -| ------------------------------------------------- | -------------- | ------------ | -------- | --------------- | -| `2024.6.0-idp-base` | [v2024.6.0] | [v1.5.1] | [v2.1.1] | [v0.4.0] | -| `2024.5.0-idp-base` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | -| `2024.3.0-idp-base` | [v2024.3.0] | [v1.4.1] | [v2.1.0] | [v0.4.0] | -| `2024.2.0-xgboost-2.0.3-idp-base` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] | -| `scikit-learning-2024.0.0-xgboost-2.0.2-idp-base` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] | - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | Intel SKLearn | Scikit-learn | XGBoost | Dockerfile | -| ---------------------------------------------------- | -------------- | ------------ | -------- | --------------- | -| `2024.6.0-idp-jupyter` | [v2024.6.0] | [v1.5.1] | [v2.1.1] | [v0.4.0] | -| `2024.5.0-idp-jupyter` | [v2024.5.0] | [v1.5.0] | [v2.1.0] | [v0.4.0] | -| `2024.3.0-idp-jupyter` | [v2024.3.0] | [v1.4.0] | [v2.1.0] | [v0.4.0] | -| `2024.2.0-xgboost-2.0.3-idp-jupyter` | [v2024.2.0] | [v1.4.1] | [v2.0.3] | [v0.4.0-Beta] | -| `scikit-learning-2024.0.0-xgboost-2.0.2-idp-jupyter` | [v2024.0.0] | [v1.3.2] | [v2.0.2] | [v0.3.4] | - -## Build from Source - -To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command: - -```bash -cd classical-ml -docker compose build ml-base -docker compose run ml-base -``` - -You can find the list of services below for each container in the group: - -| Service Name | Description | -| ------------ | 
------------------------------------------------------------------- | -| `ml-base` | Base image with [Intel® Extension for Scikit-learn*] and [XGBoost*] | -| `jupyter` | Adds Jupyter Notebook server | - -## License - -View the [License](https://github.com/intel/ai-containers/blob/main/LICENSE) for the [Intel® Distribution for Python]. - -The images below also contain other software which may be under other licenses (such as Pytorch*, Jupyter*, Bash, etc. from the base). - -It is the image user's responsibility to ensure that any use of The images below comply with any relevant licenses for all software contained within. - -\* Other names and brands may be claimed as the property of others. - - - -[Intel® Extension for Scikit-learn*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/scikit-learn.html -[Intel® Distribution for Python]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html#gs.9bos9m -[Scikit-learn*]: https://scikit-learn.org/stable/ -[XGBoost*]: https://github.com/dmlc/xgboost - -[v2024.6.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.6.0 -[v2024.5.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.5.0 -[v2024.3.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.3.0 -[v2024.2.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.2.0 -[v2024.0.0]: https://github.com/intel/scikit-learn-intelex/releases/tag/2024.0.0 - -[v1.5.1]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.5.1 -[v1.5.0]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.5.0 -[v1.4.2]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.4.2 -[v1.4.1]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.4.1 -[v1.3.2]: https://github.com/scikit-learn/scikit-learn/releases/tag/1.3.2 - -[v2.1.1]: https://github.com/dmlc/xgboost/releases/tag/v2.1.1 -[v2.1.0]: https://github.com/dmlc/xgboost/releases/tag/v2.1.0 -[v2.0.3]: https://github.com/dmlc/xgboost/releases/tag/v2.0.3 -[v2.0.2]: https://github.com/dmlc/xgboost/releases/tag/v2.0.2 - -[v0.4.0]: https://github.com/intel/ai-containers/blob/v0.4.0/classical-ml/Dockerfile -[v0.4.0-Beta]: https://github.com/intel/ai-containers/blob/v0.4.0-Beta/classical-ml/Dockerfile -[v0.3.4]: https://github.com/intel/ai-containers/blob/v0.3.4/classical-ml/Dockerfile diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 237051230..000000000 --- a/docs/index.md +++ /dev/null @@ -1,173 +0,0 @@ -# Intel® AI Containers - -[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/8270/badge)](https://www.bestpractices.dev/projects/8270) -[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/intel/ai-containers/badge)](https://securityscorecards.dev/viewer/?uri=github.com/intel/ai-containers) -[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/intel/ai-containers/main.svg)](https://results.pre-commit.ci/latest/github/intel/ai-containers/main) -[![Coverage Status](https://coveralls.io/repos/github/intel/ai-containers/badge.svg?branch=main)](https://coveralls.io/github/intel/ai-containers?branch=main) -[![CodeQL](https://github.com/intel/ai-containers/actions/workflows/github-code-scanning/codeql/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/github-code-scanning/codeql) 
-[![Docs](https://github.com/intel/ai-containers/actions/workflows/docs.yaml/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/docs.yaml) -[![Lint](https://github.com/intel/ai-containers/actions/workflows/lint.yaml/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/lint.yaml) -[![Test Runner CI](https://github.com/intel/ai-containers/actions/workflows/test-runner-ci.yaml/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/test-runner-ci.yaml) -[![Helm Chart CI](https://github.com/intel/ai-containers/actions/workflows/chart-ci.yaml/badge.svg?event=merge_group)](https://github.com/intel/ai-containers/actions/workflows/chart-ci.yaml) -[![Weekly Tests](https://github.com/intel/ai-containers/actions/workflows/weekly-test.yaml/badge.svg?branch=main)](https://github.com/intel/ai-containers/actions/workflows/weekly-test.yaml) - -This repository contains Dockerfiles, scripts, yaml files, Helm charts, etc. used to scale out AI containers with versions of TensorFlow and PyTorch that have been optimized for Intel platforms. Scaling is done with python, Docker, kubernetes, kubeflow, cnvrg.io, Helm, and other container orchestration frameworks for use in the cloud and on-premise. - -## Project Setup - -Define your project's registry and repository each time you use the project: - -```bash -# REGISTRY/REPO:TAG -export REGISTRY= -export REPO= - -docker login $REGISTRY - -# Verify your access permissions -docker pull $REGISTRY/$REPO:latest -``` - -The maintainers of Intel® AI Containers use Azure to store containers, but an open source container registry like [harbor](https://github.com/goharbor/harbor) is preferred. - -> [!WARNING] -> You can optionally skip this step and use some placeholder values, however some container groups depend on other images and will pull from a registry that you have not defined and result in an error. - -### Set Up Docker Engine - -You'll need to install Docker Engine on your development system. Note that while **Docker Engine** is free to use, **Docker Desktop** may require you to purchase a license. See the [Docker Engine Server installation instructions](https://docs.docker.com/engine/install/#server) for details. - -### Set Up Docker Compose - -Ensure you have Docker Compose installed on your machine. If you don't have this tool installed, consult the official [Docker Compose installation documentation](https://docs.docker.com/compose/install/linux/#install-the-plugin-manually). - -```bash -DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker} -mkdir -p $DOCKER_CONFIG/cli-plugins -curl -SL https://github.com/docker/compose/releases/download/v2.26.1/docker-compose-linux-x86_64 -o $DOCKER_CONFIG/cli-plugins/docker-compose -chmod +x $DOCKER_CONFIG/cli-plugins/docker-compose -docker compose version -``` - -> [!CAUTION] -> Docker compose `v2.25.0` is the minimum required version for some container groups. - -## Build Containers - -Select your framework of choice (TensorFlow*, PyTorch*, Classical ML) and run the docker compose commands: - -```bash -cd -docker compose up --build -``` - -To configure these containers, simply append the relevant environment variable to the docker compose command based on the build arguments in the compose file. 
For example: - -```bash -# I want to build ipex-base with Intel® Distribution for Python -cd pytorch -PACKAGE_OPTION=idp docker compose up --build ipex-base -``` - -> [!NOTE] -> If you didn't specify `REGISTRY` or `REPO`, you also need to add the `idp` service to the list to build the dependent python image. - -## Test Containers - -To test the containers, use the [Test Runner Framework](https://github.com/intel/ai-containers/tree/main/test-runner): - -```bash -# I want to test ipex-base with Intel® Distribution for Python -# 1. build the container in the above section -# 2. push it to a relevant registry -PACKAGE_OPTION=idp docker compose push ipex-base -cd .. -# 3. install the test runner python requirements -pip install -r test-runner/requirements.txt -# 4. Run the test file -PACKAGE_OPTION=idp python test-runner/test_runner.py -f pytorch/tests/tests.yaml -``` - -> [!TIP] -> To test a container built by GitHub Actions CI/CD, find the `run number` associated with the workflow run and set the `GITHUB_RUN_NUMBER` environment variable during execution to pull the desired image. - -## Deploy Containers - -### Install [Helm](https://helm.sh/docs/intro/install/) - -This assumes you've setup [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) and have a `KUBECONFIG`. - -```bash -curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \ -chmod 700 get_helm.sh && \ -./get_helm.sh -``` - -### Deploy a Helm Chart - -```bash -cd workflows/charts -# Select a Chart and check its README for a list of customization options and other steps required. -helm install \ - --namespace= \ - --set = \ - -``` - -### Test a Helm Chart - -Install [Chart Testing](https://github.com/helm/chart-testing). - -```bash -pip install -r workflows/charts/dev-requirements.txt -brew install chart-testing -``` - -Utilize the `ct` CLI to run `helm lint`, `helm install`, and `helm test`. - -```bash -ct lint-and-install --namespace= --config .github/ct.yaml --charts workflow/charts/ -``` - -## Troubleshooting - -- See the [Docker Troubleshooting Article](https://docs.docker.com/engine/install/troubleshoot/). -- Verify that [Docker Engine Post-Install Steps](https://docs.docker.com/engine/install/linux-postinstall/) are completed. -- When facing socket error check the group membership of the user and ensure they are part of the `docker` group. -- After changing any docker files or configs, restart the docker service `sudo systemctl restart docker`. -- Enable [Docker Desktop for WSL 2](https://docs.docker.com/desktop/windows/wsl/). -- If you are trying to access a container UI from the browser, make sure you have [port forwarded](https://code.visualstudio.com/docs/remote/ssh#_forwarding-a-port-creating-ssh-tunnel) and reconnect. 
-- If your environment requires a proxy to access the internet, export your development system's proxy settings to the docker environment: - -```bash -export DOCKER_BUILD_ARGS="--build-arg ftp_proxy=${ftp_proxy} \ - --build-arg FTP_PROXY=${FTP_PROXY} --build-arg http_proxy=${http_proxy} \ - --build-arg HTTP_PROXY=${HTTP_PROXY} --build-arg https_proxy=${https_proxy} \ - --build-arg HTTPS_PROXY=${HTTPS_PROXY} --build-arg no_proxy=${no_proxy} \ - --build-arg NO_PROXY=${NO_PROXY} --build-arg socks_proxy=${socks_proxy} \ - --build-arg SOCKS_PROXY=${SOCKS_PROXY}" -``` - -```bash -export DOCKER_RUN_ENVS="-e ftp_proxy=${ftp_proxy} \ - -e FTP_PROXY=${FTP_PROXY} -e http_proxy=${http_proxy} \ - -e HTTP_PROXY=${HTTP_PROXY} -e https_proxy=${https_proxy} \ - -e HTTPS_PROXY=${HTTPS_PROXY} -e no_proxy=${no_proxy} \ - -e NO_PROXY=${NO_PROXY} -e socks_proxy=${socks_proxy} \ - -e SOCKS_PROXY=${SOCKS_PROXY}" -``` - -```bash -docker build $DOCKER_BUILD_ARGS -t my:tag . -docker run $DOCKER_RUN_ENVS --rm -it my:tag -``` - -## Support - -The Intel AI MLOps team tracks bugs and enhancement requests using -[GitHub issues](https://github.com/intel/ai-containers/issues). Before submitting a -suggestion or bug report, search the existing GitHub issues to see if your issue has already been reported. - ---- - -- [Trademarks](http://www.intel.com/content/www/us/en/legal/trademarks.html) diff --git a/docs/preset/README.md b/docs/preset/README.md deleted file mode 100644 index b2a31dd82..000000000 --- a/docs/preset/README.md +++ /dev/null @@ -1,167 +0,0 @@ -# AI Tools Selector Preset Containers - -AI Tools Selector Preset Containers provides data scientists and developers with environment to perform various data-science tasks such as data analysis, data processing, machine learning and deep learning models training and inference. Each container is equipped with the Python packages and tools suited for each task. More detail about each container is described in the table below. - -## Preset Containers - -| Preset Container Name | Purpose | Tools | Image Name | -| -----------------------------| ------------- | ------------- | ----------------- | -| Data Analytics | Perform large scale data analysis |[Modin*](https://github.com/modin-project/modin), [Intel® Dataset Librarian](https://github.com/IntelAI/models/tree/master/datasets/dataset_api), [Intel® Data Connector](https://github.com/IntelAI/models/tree/master/datasets/cloud_data_connector) | [`intel/data-analytics:latest-py3.9`](https://hub.docker.com/r/intel/data-analytics/tags)
[`intel/data-analytics:latest-py3.10`](https://hub.docker.com/r/intel/data-analytics/tags) | -| Classical ML | Train classical-ml models using scikit, modin and xgboost |[Intel® extension for SciKit Learn](https://github.com/intel/scikit-learn-intelex), [Intel® Optimization for XGBoost*](https://github.com/dmlc/xgboost), [Modin*](https://github.com/modin-project/modin),
[Intel® Dataset Librarian](https://github.com/IntelAI/models/tree/master/datasets/dataset_api), [Intel® Data Connector](https://github.com/IntelAI/models/tree/master/datasets/cloud_data_connector) | [`intel/classical-ml:latest-py3.9`](https://hub.docker.com/r/intel/classical-ml/tags)
[`intel/classical-ml:latest-py3.10`](https://hub.docker.com/r/intel/classical-ml/tags) | -| Deep Learning | Train large scale Deep Learning models with Tensorflow or PyTorch | [Intel® Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch), [Intel® Extension for Tensorflow](https://github.com/intel/intel-extension-for-tensorflow),
[Intel® Optimization for Horovod](https://github.com/intel/intel-optimization-for-horovod), [Intel® Dataset Librarian](https://github.com/IntelAI/models/tree/master/datasets/dataset_api), [Intel® Data Connector](https://github.com/IntelAI/models/tree/master/datasets/cloud_data_connector), [Intel® Extension for DeepSpeed](https://github.com/intel/intel-extension-for-deepspeed) | [`intel/deep-learning:latest-py3.9`](https://hub.docker.com/r/intel/deep-learning/tags)
[`intel/deep-learning:latest-py3.10`](https://hub.docker.com/r/intel/deep-learning/tags) | -| Inference Optimization | Optimize Deep Learning models for inference
using Intel® Neural Compressor | [Intel® Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch), [Intel® Extension for Tensorflow](https://github.com/intel/intel-extension-for-tensorflow),
[Intel® Neural Compressor](https://github.com/intel/neural-compressor), [Intel® Dataset Librarian](https://github.com/IntelAI/models/tree/master/datasets/dataset_api), [Intel® Data Connector](https://github.com/IntelAI/models/tree/master/datasets/cloud_data_connector) | [`intel/inference-optimization:latest-py3.9`](https://hub.docker.com/r/intel/inference-optimization/tags)
[`intel/inference-optimization:latest-py3.10`](https://hub.docker.com/r/intel/inference-optimization/tags) | - -## Prerequisites - -1. Make sure [docker](https://docs.docker.com/engine/) is installed on the machine. Follow the [instruction here](https://docs.docker.com/engine/install/) to install docker engine on a host machine. - -2. Pull a Preset Container of your choice from the [AI Tools Selector](https://www.intel.com/content/www/us/en/developer/tools/oneapi/ai-tools-selector.html) or from the [table](#preset-containers). The commands below use the `deep-learning` preset as an example. - -```bash -docker pull intel/deep-learning:latest-py3.9 -``` - -## Run Preset Container - -There are 3 modes to run these containers: - -* [Interactive](#run-in-interactive-mode) -* [Jupyter](#run-using-jupyter-notebook) -* [Multi-Node Distributed Training](#run-in-multi-node-distributed-mode-advanced) (Deep Learning and Inference Optimization) - -> [!NOTE] -> Modify the commands below to fit your use case, especially the image, environment variables, and GPU device path. - -### Run in Interactive Mode - -This mode allows running the container in an interactive shell. This enables the ability to interact with the container's bash shell. Below is the command to start the container in interactive mode: - -#### Run on CPU - -```bash -docker run -it --rm \ - --shm-size=12G \ - -v ${PWD}:/home/dev/workdir \ - intel/deep-learning:latest-py3.9 bash -``` - -> [!NOTE] -> Certain applications use shared memory to share data between processes. But the default shared memory segment size is 64M for docker containers, and is not enough for multithreaded applications (Ex. Modin*). Docker recommends increasing shared memory size using `--shm-size`. - -#### Run on GPU - -Find your machine's `RENDER` and `VIDEO` group values to enable [Intel® Flex/Max GPU](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu.html). - -```bash -RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') -VIDEO=$(getent group video | sed -E 's,^video:[^:]*:([^:]*):.*$,\1,') -test -z "$RENDER" || RENDER_GROUP="--group-add ${RENDER}" -test -z "$VIDEO" || VIDEO_GROUP="--group-add ${VIDEO}" -``` - -```bash -docker run -it --rm \ - ${RENDER_GROUP} \ - ${VIDEO_GROUP} \ - --device=/dev/dri \ - --shm-size=12G \ - -v ${PWD}:/home/dev/workdir \ - -v /dev/dri/by-path:/dev/dri/by-path \ - intel/deep-learning:latest-py3.9 bash -``` - -> [!NOTE] -> Certain applications use shared memory to share data between processes. But the default shared memory segment size is 64M for docker containers, and is not enough for multithreaded applications(Ex. Modin). Docker recommends increasing shared memory size using `--shm-size`. - -#### Next Steps - -1. For Deep Learning and Inference Optimization containers there will be separate conda environments for each AI framework: `pytorch-cpu`, `pytorch-gpu` and `tensorflow`. Use the command below to activate one environment: - - ```bash - conda activate - ``` - -2. Select a test from the `sample-tests` folder and run it using the following command as an example: - - ```bash - bash sample-tests/onnx/run.sh - # or if no bash script is found - python sample-tests/intel_extension_for_tensorflow/test_itex.py - ``` - -### Run using Jupyter Notebook - -This mode launches a jupyterlab notebook server. The command below will start the jupyterlab server which can be accessed from a web browser. Each container includes jupyter kernel to enable conda environment in jupyter notebook. 
The port for this server is `8888` and is exposed by default when you run the container. - -> [!NOTE] -> When launching a Jupyter server this way, Docker assigns a [network](https://docs.docker.com/engine/tutorials/networkingcontainers/) so that the container can communicate with other applications like a web browser. By default Docker launches containers on the `bridge` network; if you are trying to access this server from a machine you are `ssh`'ing into, change the network mode with the `--net=host` flag and forward the port locally with `ssh -L 8888:8888`. - -#### Run on Jupyter CPU - -```bash -docker run -it --rm \ - --shm-size=12G \ - -v ${PWD}:/home/dev/workdir \ - intel/deep-learning:latest-py3.9 -``` - -> [!NOTE] -> Certain applications use shared memory to share data between processes. However, the default shared memory segment size for Docker containers is 64MB, which is not enough for multithreaded applications (e.g. Modin*). Docker recommends increasing the shared memory size using `--shm-size`. - -#### Run on Jupyter GPU - -Find your machine's `RENDER` and `VIDEO` group values to enable [Intel® Flex/Max GPU](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu.html). - -```bash -RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') -VIDEO=$(getent group video | sed -E 's,^video:[^:]*:([^:]*):.*$,\1,') -test -z "$RENDER" || RENDER_GROUP="--group-add ${RENDER}" -test -z "$VIDEO" || VIDEO_GROUP="--group-add ${VIDEO}" -``` - -```bash -docker run -it --rm \ - ${RENDER_GROUP} \ - ${VIDEO_GROUP} \ - --device=/dev/dri \ - --shm-size=12G \ - -v ${PWD}:/home/dev/workdir \ - -v /dev/dri/by-path:/dev/dri/by-path \ - intel/deep-learning:latest-py3.9 -``` - -> [!NOTE] -> Certain applications use shared memory to share data between processes. However, the default shared memory segment size for Docker containers is 64MB, which is not enough for multithreaded applications (e.g. Modin*). Docker recommends increasing the shared memory size using `--shm-size`. - -#### Next Steps - -1. After running this command, the terminal should display output similar to the image below. ![image](https://github.com/intel/ai-containers/assets/18349036/0a8a2d05-f7b0-4a9f-994e-bcc4e4b703a0) Use the server address and port to connect to the Jupyter server from a web browser, for example `http://127.0.0.1:8888`. The token displayed after `token=` is the password for logging in to the server; in the image above the token is `b66e74a85bc2570bf15782e5124c933c3a4ddabd2cf2d7d3`. - -2. Select a notebook sample from the Overview notebook found in the directory you launched the server from. In this example, the `intel/deep-learning` container has a notebook titled [`Deep_Learning_Samples_Overview.ipynb`](https://github.com/intel/ai-containers/blob/main/preset/deep-learning/notebooks/Deep_Learning_Samples_Overview.ipynb) when launched in jupyter mode. - -3. After selecting a notebook sample, select the preset kernel from the dropdown menu presented when the notebook loads. For Deep Learning and Inference Optimization containers there will be multiple kernels, one for each framework: `pytorch`, `pytorch-gpu`, and `tensorflow`. - -##### Advanced Jupyter Server Configuration - -To change the network (port/IP) and security (privilege) settings of the notebook server, append the default example below to the docker run commands above: - -```bash -docker run ... 
intel/deep-learning:latest-py3.9 \ - bash -c "jupyter notebook --notebook-dir=~/jupyter \ - --port 8888 \ - --ip 0.0.0.0 \ - --no-browser \ - --allow-root" -``` - -## Run in Multi-Node Distributed Mode [Advanced] - -Follow the instructions provided for [Intel® Extension for TensorFlow*](https://github.com/intel/ai-containers/tree/main/preset/deep-learning/demo/tensorflow-distributed/README.md) and [Intel® Extension for PyTorch*](https://github.com/intel/ai-containers/tree/main/preset/deep-learning/demo/pytorch-distributed/README.md) to use the Deep Learning or Inference Optimization presets with your preferred framework. - -## Troubleshooting and Support - -If you run into an issue using a container, you can find troubleshooting information [here](https://github.com/intel/ai-containers#troubleshooting). If you need more help, feel free to submit an [issue](https://github.com/intel/ai-containers/issues). - ---- - -*Other names and brands may be claimed as the property of others. [Trademarks](http://www.intel.com/content/www/us/en/legal/trademarks.html) diff --git a/docs/python/README.md b/docs/python/README.md deleted file mode 100644 index 0b9f95dcd..000000000 --- a/docs/python/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Intel® Distribution for Python - -[Intel® Distribution for Python] enhances performance and can speed up your programs by 10 to 100 times. It is a Python distribution that includes the [Intel® Math Kernel Library] (oneMKL) and other Intel performance libraries to enable near-native performance through acceleration of core numerical and machine learning packages. - -[Intel® Distribution for Python] is available as part of the [Intel® oneAPI Base Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit.html). - -## Images - -The images below come in two variations: one with only the core packages of the [Intel® Distribution for Python] installation, and one with all of the packages. - -| Tag(s) | IDP | -| ---------------------- | ---------- | -| `3.10-full`, `latest` | `2024.2.0` | -| `3.10-core` | `2024.2.0` | - -## Build from Source - -To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to set up your environment, and run the following command: - -```bash -cd python -docker compose build idp -docker compose run idp -``` - -You can find the list of services below for each container in the group: - -| Service Name | Description | -| ------------ | ------------------------------------------------------------------- | -| `idp` | Base image with [Intel® Distribution for Python] | -| `pip` | Equivalent Python image without [Intel® Distribution for Python] | - -## License - -View the [License](https://github.com/intel/ai-containers/blob/main/LICENSE) for the [Intel® Distribution for Python]. - -It is the image user's responsibility to ensure that any use of these images complies with any relevant licenses for all software contained within. - -\* Other names and brands may be claimed as the property of others. 
- - - -[Intel® Distribution for Python]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html#gs.9bos9m -[Intel® Math Kernel Library]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html diff --git a/docs/pytorch/README.md b/docs/pytorch/README.md deleted file mode 100644 index 53adcb1d8..000000000 --- a/docs/pytorch/README.md +++ /dev/null @@ -1,434 +0,0 @@ -# Intel® Extension for PyTorch\* - -[Intel® Extension for PyTorch*] extends [PyTorch*] with up-to-date feature optimizations for an extra performance boost on Intel hardware. - -On Intel CPUs, optimizations take advantage of the following instruction sets: - -* Intel® Advanced Matrix Extensions (Intel® AMX) -* Intel® Advanced Vector Extensions 512 (Intel® AVX-512) -* Vector Neural Network Instructions (VNNI) - -On Intel GPUs, Intel® Extension for PyTorch\* provides easy GPU acceleration through the PyTorch* `xpu` device. The following Intel GPUs are supported: - -* [Intel® Arc™ A-Series Graphics] -* [Intel® Data Center GPU Flex Series] -* [Intel® Data Center GPU Max Series] - -Images available here start with the [Ubuntu* 22.04](https://hub.docker.com/_/ubuntu) base image with [Intel® Extension for PyTorch*] built for different use cases as well as some additional software. The [Python Dockerfile](https://github.com/intel/ai-containers/blob/main/python/Dockerfile) is used to generate the images below; see https://github.com/intel/ai-containers. - -> **Note:** There are two dockerhub repositories (`intel/intel-extension-for-pytorch` and `intel/intel-optimized-pytorch`) that are routinely updated with the latest images; however, some legacy images have not been published to both repositories. - -## XPU images - -The images below include support for both CPU and GPU optimizations: - -| Tag(s) | Pytorch | IPEX | Driver | Dockerfile | -| ---------------------- | -------- | -------------- | ------ | --------------- | -| `2.1.40-xpu` | [v2.1.0] | [v2.1.40+xpu] | [914] | [v0.4.0-Beta] | -| `2.1.30-xpu` | [v2.1.0] | [v2.1.30+xpu] | [803] | [v0.4.0-Beta] | -| `2.1.20-xpu` | [v2.1.0] | [v2.1.20+xpu] | [803] | [v0.3.4] | -| `2.1.10-xpu` | [v2.1.0] | [v2.1.10+xpu] | [736] | [v0.2.3] | -| `xpu-flex-2.0.110-xpu` | [v2.0.1] | [v2.0.110+xpu] | [647] | [v0.1.0] | - ---- - -```bash -docker run -it --rm \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - --ipc=host \ - intel/intel-extension-for-pytorch:2.1.40-xpu -``` - ---- - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | Pytorch | IPEX | Driver | Jupyter Port | Dockerfile | -| --------------------- | -------- | ------------- | ------ | ------------ | --------------- | -| `2.1.40-xpu-pip-jupyter` | [v2.1.0] | [v2.1.40+xpu] | [914] | `8888` | [v0.4.0-Beta] | -| `2.1.20-xpu-pip-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] | -| `2.1.10-xpu-pip-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] | - -### Run the XPU Jupyter Container - -```bash -docker run -it --rm \ - -p 8888:8888 \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - intel/intel-extension-for-pytorch:2.1.40-xpu-pip-jupyter -``` - -After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. 
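- -To quickly confirm that a container can see the GPU, a minimal sketch like the one below can be run inside either XPU image. It assumes only the packages the image already ships; the printed device name will vary with your hardware: - -```python -import torch -import intel_extension_for_pytorch as ipex  # noqa: F401 - registers the `xpu` device with PyTorch - -# Check that at least one XPU device is visible before using it -if torch.xpu.is_available(): -    print("XPU device:", torch.xpu.get_device_name(0)) -else: -    print("No XPU device found; check the --device and /dev/dri mounts") -``` 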
- -## CPU only images - -The images below are built only with CPU optimizations (GPU acceleration support was deliberately excluded): - -| Tag(s) | Pytorch | IPEX | Dockerfile | -| -------------------------- | -------- | ------------ | --------------- | -| `2.4.0-pip-base`, `latest` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | -| `2.3.0-pip-base` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | -| `2.2.0-pip-base` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | -| `2.1.0-pip-base` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] | -| `2.0.0-pip-base` | [v2.0.0] | [v2.0.0+cpu] | [v0.1.0] | - -### Run the CPU Container - -```bash -docker run -it --rm intel/intel-extension-for-pytorch:latest -``` - ---- - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | Pytorch | IPEX | Dockerfile | -| ------------------- | -------- | ------------ | --------------- | -| `2.4.0-pip-jupyter` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | -| `2.3.0-pip-jupyter` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | -| `2.2.0-pip-jupyter` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | -| `2.1.0-pip-jupyter` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] | -| `2.0.0-pip-jupyter` | [v2.0.0] | [v2.0.0+cpu] | [v0.1.0] | - -```bash -docker run -it --rm \ - -p 8888:8888 \ - -v $PWD/workspace:/workspace \ - -w /workspace \ - intel/intel-extension-for-pytorch:2.4.0-pip-jupyter -``` - -After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. - ---- - -The images below additionally include [Intel® oneAPI Collective Communications Library] (oneCCL) and Neural Compressor ([INC]): - -| Tag(s) | Pytorch | IPEX | oneCCL | INC | Dockerfile | -| --------------------- | -------- | ------------ | -------------------- | --------- | -------------- | -| `2.4.0-pip-multinode` | [v2.4.0] | [v2.4.0+cpu] | [v2.4.0][ccl-v2.4.0] | [v3.0] | [v0.4.0-Beta] | -| `2.3.0-pip-multinode` | [v2.3.0] | [v2.3.0+cpu] | [v2.3.0][ccl-v2.3.0] | [v2.6] | [v0.4.0-Beta] | -| `2.2.0-pip-multinode` | [v2.2.2] | [v2.2.0+cpu] | [v2.2.0][ccl-v2.2.0] | [v2.6] | [v0.4.0-Beta] | -| `2.1.100-pip-mulitnode` | [v2.1.2] | [v2.1.100+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.6] | [v0.4.0-Beta] | -| `2.0.100-pip-multinode` | [v2.0.1] | [v2.0.100+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.6] | [v0.4.0-Beta] | - -> [!NOTE] -> Passwordless SSH connection is also enabled in the image, but the container does not contain any SSH identity keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`. - -> [!TIP] -> Before mounting any keys, modify the permissions of those files with `chmod 600 authorized_keys; chmod 600 id_rsa` to grant read access for the default user account. - -#### Setup and Run IPEX Multi-Node Container - -Some additional assembly is required to use this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively: - -SSH Server (Worker) - -1. *Authorized Keys* : `/etc/ssh/authorized_keys` - -SSH Client (Launcher) - -1. *Private User Key* : `/root/.ssh/id_rsa` - -To add these files correctly, follow the steps described below. - -1. Setup ID Keys - - You can use the commands provided below to [generate the identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH. - - ```bash - ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa - touch authorized_keys - cat id_rsa.pub >> authorized_keys - ``` - -2. 
Configure the permissions and ownership for all of the files you have created so far - - ```bash - chmod 600 id_rsa config authorized_keys - chown root:root id_rsa.pub id_rsa config authorized_keys - ``` - -3. Create a hostfile for `torchrun` or `ipexrun`. (Optional) - - ```txt - Host host1 - HostName <host-ip> - IdentitiesOnly yes - IdentityFile ~/.root/id_rsa - Port <port> - Host host2 - HostName <host-ip> - IdentitiesOnly yes - IdentityFile ~/.root/id_rsa - Port <port> - ... - ``` - -4. Configure [Intel® oneAPI Collective Communications Library] in your python script - - ```python - import os - - import oneccl_bindings_for_pytorch  # noqa: F401 - registers the "ccl" backend - import torch.distributed as dist - - dist.init_process_group( - backend="ccl", - init_method="tcp://127.0.0.1:3022", - world_size=int(os.environ.get("WORLD_SIZE")), - rank=int(os.environ.get("RANK")), - ) - ``` - -5. Now start the workers and execute DDP on the launcher - - 1. Worker run command: - - ```bash - docker run -it --rm \ - --net=host \ - -v $PWD/authorized_keys:/etc/ssh/authorized_keys \ - -v $PWD/tests:/workspace/tests \ - -w /workspace \ - intel/intel-extension-for-pytorch:2.4.0-pip-multinode \ - bash -c '/usr/sbin/sshd -D' - ``` - - 2. Launcher run command: - - ```bash - docker run -it --rm \ - --net=host \ - -v $PWD/id_rsa:/root/.ssh/id_rsa \ - -v $PWD/tests:/workspace/tests \ - -v $PWD/hostfile:/workspace/hostfile \ - -w /workspace \ - intel/intel-extension-for-pytorch:2.4.0-pip-multinode \ - bash -c 'ipexrun cpu --nnodes 2 --nprocs-per-node 1 --master-addr 127.0.0.1 --master-port 3022 /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl' - ``` - -> [!NOTE] -> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure it for your network. - -#### Enable [DeepSpeed*] optimizations - -To enable [DeepSpeed*] optimizations with [Intel® oneAPI Collective Communications Library], add the following to your python script: - -```python -import deepspeed - -# Rather than dist.init_process_group(), use deepspeed.init_distributed() -deepspeed.init_distributed(backend="ccl") -``` - -Additionally, if you have a [DeepSpeed* configuration](https://www.deepspeed.ai/getting-started/#deepspeed-configuration), you can use the command below as your launcher to run your script with that configuration: - -```bash - docker run -it --rm \ - --net=host \ - -v $PWD/id_rsa:/root/.ssh/id_rsa \ - -v $PWD/tests:/workspace/tests \ - -v $PWD/hostfile:/workspace/hostfile \ - -v $PWD/ds_config.json:/workspace/ds_config.json \ - -w /workspace \ - intel/intel-extension-for-pytorch:2.4.0-pip-multinode \ - bash -c 'deepspeed --launcher IMPI \ - --master_addr 127.0.0.1 --master_port 3022 \ - --deepspeed_config ds_config.json --hostfile /workspace/hostfile \ - /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl --deepspeed' -``` - ---- - -#### Hugging Face Generative AI Container - -The image below is an extension of the IPEX Multi-Node Container designed to run Hugging Face Generative AI scripts. The container has the typical installations needed to run and fine-tune PyTorch generative text models from Hugging Face. It can be used to run multinode jobs using the same instructions from the [IPEX Multi-Node container](#setup-and-run-ipex-multi-node-container). 
- -| Tag(s) | Pytorch | IPEX | oneCCL | HF Transformers | Dockerfile | -| ------------------------------------- | -------- | ------------ | -------------------- | --------------- | --------------- | -| `2.4.0-pip-multinode-hf-4.44.0-genai` | [v2.4.0] | [v2.4.0+cpu] | [v2.4.0][ccl-v2.4.0] | [v4.44.0] | [v0.4.0-Beta] | - -Below is an example that shows a single-node job with the existing [`finetune.py`](../workflows/charts/huggingface-llm/scripts/finetune.py) script. - -```bash -# Change into home directory first and run the command -docker run -it \ - -v $PWD/workflows/charts/huggingface-llm/scripts:/workspace/scripts \ - -w /workspace/scripts \ - intel/intel-extension-for-pytorch:2.4.0-pip-multinode-hf-4.44.0-genai \ - bash -c 'python finetune.py <args>' -``` - ---- - -The images below are [TorchServe*] with CPU Optimizations: - -| Tag(s) | Pytorch | IPEX | Dockerfile | -| ------------------- | -------- | ------------ | --------------- | -| `2.4.0-serving-cpu` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | -| `2.3.0-serving-cpu` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | -| `2.2.0-serving-cpu` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | - -For more details, follow the procedure in the [TorchServe](https://github.com/pytorch/serve/blob/master/examples/intel_extension_for_pytorch/README.md) instructions. - -## CPU only images with Intel® Distribution for Python* - -The images below are built only with CPU optimizations (GPU acceleration support was deliberately excluded) and include [Intel® Distribution for Python*]: - -| Tag(s) | Pytorch | IPEX | Dockerfile | -| ---------------- | -------- | ------------ | --------------- | -| `2.4.0-idp-base` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | -| `2.3.0-idp-base` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | -| `2.2.0-idp-base` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | -| `2.1.0-idp-base` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] | -| `2.0.0-idp-base` | [v2.0.0] | [v2.0.0+cpu] | [v0.1.0] | - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | Pytorch | IPEX | Dockerfile | -| ------------------- | -------- | ------------ | --------------- | -| `2.4.0-idp-jupyter` | [v2.4.0] | [v2.4.0+cpu] | [v0.4.0-Beta] | -| `2.3.0-idp-jupyter` | [v2.3.0] | [v2.3.0+cpu] | [v0.4.0-Beta] | -| `2.2.0-idp-jupyter` | [v2.2.0] | [v2.2.0+cpu] | [v0.3.4] | -| `2.1.0-idp-jupyter` | [v2.1.0] | [v2.1.0+cpu] | [v0.2.3] | -| `2.0.0-idp-jupyter` | [v2.0.0] | [v2.0.0+cpu] | [v0.1.0] | - -The images below additionally include [Intel® oneAPI Collective Communications Library] (oneCCL) and Neural Compressor ([INC]): - -| Tag(s) | Pytorch | IPEX | oneCCL | INC | Dockerfile | -| --------------------- | -------- | ------------ | -------------------- | --------- | --------------- | -| `2.4.0-idp-multinode` | [v2.4.0] | [v2.4.0+cpu] | [v2.4.0][ccl-v2.4.0] | [v3.0] | [v0.4.0-Beta] | -| `2.3.0-idp-multinode` | [v2.3.0] | [v2.3.0+cpu] | [v2.3.0][ccl-v2.3.0] | [v2.6] | [v0.4.0-Beta] | -| `2.2.0-idp-multinode` | [v2.2.0] | [v2.2.0+cpu] | [v2.2.0][ccl-v2.2.0] | [v2.4.1] | [v0.3.4] | -| `2.1.0-idp-mulitnode` | [v2.1.0] | [v2.1.0+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.3.1] | [v0.2.3] | -| `2.0.0-idp-multinode` | [v2.0.0] | [v2.0.0+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.1.1] | [v0.1.0] | - -## XPU images with Intel® Distribution for Python* - -The images below are built with both CPU and GPU optimizations and include [Intel® Distribution for Python*]: - -| Tag(s) | Pytorch | IPEX | Driver | Dockerfile | -| ---------------- | -------- | ------------ | -------- | ------ | -| 
`2.1.40-xpu-idp-base` | [v2.1.0] | [v2.1.40+xpu] | [914] | [v0.4.0-Beta] | -| `2.1.30-xpu-idp-base` | [v2.1.0] | [v2.1.30+xpu] | [803] | [v0.4.0-Beta] | -| `2.1.10-xpu-idp-base` | [v2.1.0] | [v2.1.10+xpu] | [736] | [v0.2.3] | - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | Pytorch | IPEX | Driver | Jupyter Port | Dockerfile | -| --------------------- | -------- | ------------- | ------ | ------------ | --------------- | -| `2.1.40-xpu-idp-jupyter` | [v2.1.0] | [v2.1.40+xpu] | [914] | `8888` | [v0.4.0-Beta] | -| `2.1.20-xpu-idp-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] | -| `2.1.10-xpu-idp-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] | - -## Build from Source - -To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to set up your environment, and run the following command: - -```bash -cd pytorch -docker compose build ipex-base -docker compose run ipex-base -``` - -You can find the list of services below for each container in the group: - -| Service Name | Description | -| ------------- | ------------------------------------------------------------------- | -| `ipex-base` | Base image with [Intel® Extension for PyTorch*] | -| `jupyter` | Adds Jupyter Notebook server | -| `multinode` | Adds [Intel® oneAPI Collective Communications Library] and [INC] | -| `xpu` | Adds Intel GPU Support | -| `xpu-jupyter` | Adds Jupyter notebook server to GPU image | -| `serving` | [TorchServe*] | - -## MLPerf Optimized Workloads - -The following images are available for MLPerf-optimized workloads. Instructions are available at '[Get Started with Intel MLPerf]'. - -| Tag(s) | Base OS | MLPerf Round | Target Platform | -| --------------------------------- | -------------- | ---------------- | ------------------------------- | -| `mlperf-inference-4.1-resnet50` | rockylinux:8.7 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ | -| `mlperf-inference-4.1-retinanet` | ubuntu:22.04 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ | -| `mlperf-inference-4.1-gptj` | ubuntu:22.04 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ | -| `mlperf-inference-4.1-bert` | ubuntu:22.04 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ | -| `mlperf-inference-4.1-dlrmv2` | rockylinux:8.7 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ | -| `mlperf-inference-4.1-3dunet` | ubuntu:22.04 | [Inference v4.1] | Intel(R) Xeon(R) Platinum 8592+ | - -## License - -View the [License](https://github.com/intel/intel-extension-for-pytorch/blob/main/LICENSE) for the [Intel® Extension for PyTorch*]. - -These images also contain other software which may be under other licenses (such as PyTorch*, Jupyter*, Bash, etc. from the base). - -It is the image user's responsibility to ensure that any use of these images complies with any relevant licenses for all software contained within. - -\* Other names and brands may be claimed as the property of others. 
- - - -[Intel® Arc™ A-Series Graphics]: https://ark.intel.com/content/www/us/en/ark/products/series/227957/intel-arc-a-series-graphics.html -[Intel® Data Center GPU Flex Series]: https://ark.intel.com/content/www/us/en/ark/products/series/230021/intel-data-center-gpu-flex-series.html -[Intel® Data Center GPU Max Series]: https://ark.intel.com/content/www/us/en/ark/products/series/232874/intel-data-center-gpu-max-series.html - -[Intel® MPI]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html -[Intel® Extension for PyTorch*]: https://intel.github.io/intel-extension-for-pytorch/ -[Intel® Distribution for Python*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html -[Intel® oneAPI Collective Communications Library]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html -[INC]: https://github.com/intel/neural-compressor -[PyTorch*]: https://pytorch.org/ -[TorchServe*]: https://github.com/pytorch/serve -[DeepSpeed*]: https://github.com/microsoft/DeepSpeed - -[v0.4.0-Beta]: https://github.com/intel/ai-containers/blob/main/pytorch/Dockerfile -[v0.3.4]: https://github.com/intel/ai-containers/blob/v0.3.4/pytorch/Dockerfile -[v0.2.3]: https://github.com/intel/ai-containers/blob/v0.2.3/pytorch/Dockerfile -[v0.1.0]: https://github.com/intel/ai-containers/blob/v0.1.0/pytorch/Dockerfile - -[v2.1.40+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.40%2Bxpu -[v2.1.30+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.30%2Bxpu -[v2.1.20+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.20%2Bxpu -[v2.1.10+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.10%2Bxpu -[v2.0.110+xpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.0.110%2Bxpu - -[v2.4.0]: https://github.com/pytorch/pytorch/releases/tag/v2.4.0 -[v2.3.0]: https://github.com/pytorch/pytorch/releases/tag/v2.3.0 -[v2.2.2]: https://github.com/pytorch/pytorch/releases/tag/v2.2.2 -[v2.2.0]: https://github.com/pytorch/pytorch/releases/tag/v2.2.0 -[v2.1.2]: https://github.com/pytorch/pytorch/releases/tag/v2.1.2 -[v2.1.0]: https://github.com/pytorch/pytorch/releases/tag/v2.1.0 -[v2.0.1]: https://github.com/pytorch/pytorch/releases/tag/v2.0.1 -[v2.0.0]: https://github.com/pytorch/pytorch/releases/tag/v2.0.0 - -[v3.0]: https://github.com/intel/neural-compressor/releases/tag/v3.0 -[v2.6]: https://github.com/intel/neural-compressor/releases/tag/v2.6 -[v2.4.1]: https://github.com/intel/neural-compressor/releases/tag/v2.4.1 -[v2.3.1]: https://github.com/intel/neural-compressor/releases/tag/v2.3.1 -[v2.1.1]: https://github.com/intel/neural-compressor/releases/tag/v2.1.1 - -[v2.4.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.4.0%2Bcpu -[v2.3.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.3.0%2Bcpu -[v2.2.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.2.0%2Bcpu -[v2.1.100+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.100%2Bcpu -[v2.1.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.1.0%2Bcpu -[v2.0.100+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.0.100%2Bcpu -[v2.0.0+cpu]: https://github.com/intel/intel-extension-for-pytorch/releases/tag/v2.0.0%2Bcpu - -[ccl-v2.4.0]: https://github.com/intel/torch-ccl/releases/tag/v2.4.0%2Bcpu%2Brc0 -[ccl-v2.3.0]: 
https://github.com/intel/torch-ccl/releases/tag/v2.3.0%2Bcpu -[ccl-v2.2.0]: https://github.com/intel/torch-ccl/releases/tag/v2.2.0%2Bcpu -[ccl-v2.1.0]: https://github.com/intel/torch-ccl/releases/tag/v2.1.0%2Bcpu -[ccl-v2.0.0]: https://github.com/intel/torch-ccl/releases/tag/v2.0.0%2Bcpu - - -[v4.44.0]: https://github.com/huggingface/transformers/releases/tag/v4.44.0 - -[914]: https://dgpu-docs.intel.com/releases/stable_914_33_20240730.html -[803]: https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html -[736]: https://dgpu-docs.intel.com/releases/stable_736_25_20231031.html -[647]: https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html - - -[Inference v4.1]: https://mlcommons.org/benchmarks/inference-datacenter -[Get Started with Intel MLPerf]: https://www.intel.com/content/www/us/en/developer/articles/guide/get-started-mlperf-intel-optimized-docker-images.html diff --git a/docs/tensorflow/README.md b/docs/tensorflow/README.md deleted file mode 100644 index ac2c8b7c9..000000000 --- a/docs/tensorflow/README.md +++ /dev/null @@ -1,353 +0,0 @@ -# Intel® Extension for TensorFlow\* - -[Intel® Extension for TensorFlow*] extends [TensorFlow*] with up-to-date feature optimizations for an extra performance boost on Intel hardware. - -[Intel® Extension for TensorFlow*] is based on the TensorFlow [PluggableDevice] interface to bring Intel XPU (GPU, CPU, etc.) devices into [TensorFlow*] with flexibility for on-demand performance on the following Intel GPUs: - -* [Intel® Arc™ A-Series Graphics] -* [Intel® Data Center GPU Flex Series] -* [Intel® Data Center GPU Max Series] - -> **Note:** There are two dockerhub repositories (`intel/intel-extension-for-tensorflow` and `intel/intel-optimized-tensorflow`) that are routinely updated with the latest images; however, some legacy images have not been published to both repositories. - -## XPU images - -The images below include support for both CPU and GPU optimizations: - -| Tag(s) | TensorFlow | ITEX | Driver | Dockerfile | -| ---------------------- | ----------- | -------------- | ------- | --------------- | -| `2.15.0.1-xpu`, `xpu` | [v2.15.1] | [v2.15.0.1] | [803.63]| [v0.4.0-Beta] | -| `2.15.0.0-xpu` | [v2.15.0] | [v2.15.0.0] | [803] | [v0.4.0-Beta] | -| `2.14.0.1-xpu` | [v2.14.1] | [v2.14.0.1] | [736] | [v0.3.4] | -| `2.13.0.0-xpu` | [v2.13.0] | [v2.13.0.0] | [647] | [v0.2.3] | - -### Run the XPU Container - -```bash -docker run -it --rm \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - --ipc=host \ - intel/intel-extension-for-tensorflow:xpu -``` - ---- - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | TensorFlow | ITEX | Driver | Dockerfile | -| ------------- | ----------- | ------------- | ------ | --------------- | -| `2.15.0.1-xpu-jupyter` | [v2.15.1] | [v2.15.0.1] | [803.63]| [v0.4.0-Beta] | -| `xpu-jupyter` | [v2.14.1] | [v2.14.0.1] | [736] | [v0.3.4] | - -### Run the XPU Jupyter Container - -```bash -docker run -it --rm \ - -p 8888:8888 \ - --net=host \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - --ipc=host \ - intel/intel-extension-for-tensorflow:2.15.0.1-xpu-jupyter -``` - -After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. 
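- -To verify that the XPU [PluggableDevice] is registered inside either XPU image, a short sketch such as the following can be used (it assumes only the packages the image already ships): - -```python -import tensorflow as tf -import intel_extension_for_tensorflow as itex  # loads the XPU PluggableDevice plugin - -# An empty list here means no GPU was mapped into the container -print("XPU devices:", tf.config.list_physical_devices("XPU")) -print("ITEX version:", itex.__version__) -``` 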
- --- - -The images below are [TensorFlow* Serving] with GPU Optimizations: - -| Tag(s) | TensorFlow | ITEX | -| ------------------------------------- | ----------- | ------------ | -| `2.14.0.1-serving-gpu`, `serving-gpu` | [v2.14.1] | [v2.14.0.1] | -| `2.13.0.0-serving-gpu` | [v2.13.0] | [v2.13.0.0] | - -### Run the Serving GPU Container - -```bash -docker run -it --rm \ - -p 8500:8500 \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - -v $PWD/workspace:/workspace \ - -w /workspace \ - -e MODEL_NAME=<model name> \ - -e MODEL_DIR=<model dir> \ - intel/intel-extension-for-tensorflow:serving-gpu -``` - -For more details, follow the procedure in the [Intel® Extension for TensorFlow* Serving] instructions. - -## CPU only images - -The images below are built only with CPU optimizations (GPU acceleration support was deliberately excluded): - -| Tag(s) | TensorFlow | ITEX | Dockerfile | -| --------------------------- | ----------- | ------------ | --------------- | -| `2.15.1-pip-base`, `latest` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | -| `2.15.0-pip-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | -| `2.14.0-pip-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | -| `2.13-pip-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | TensorFlow | ITEX | Dockerfile | -| -------------------- | ----------- | ------------- | --------------- | -| `2.15.1-pip-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | -| `2.15.0-pip-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | -| `2.14.0-pip-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | -| `2.13-pip-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | - -### Run the CPU Jupyter Container - -```bash -docker run -it --rm \ - -p 8888:8888 \ - --net=host \ - -v $PWD/workspace:/workspace \ - -w /workspace \ - intel/intel-extension-for-tensorflow:2.15.1-pip-jupyter -``` - -After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server. - ---- - -The images below additionally include [Horovod]: - -| Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile | -| ------------------------------ | --------- | ------------ | --------- | --------------- | -| `2.15.1-pip-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] | -| `2.15.0-pip-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] | -| `2.14.0-pip-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] | -| `2.13-pip-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] | - -> [!NOTE] -> Passwordless SSH connection is also enabled in the image, but the container does not contain any SSH identity keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`. - -> [!TIP] -> Before mounting any keys, modify the permissions of those files with `chmod 600 authorized_keys; chmod 600 id_rsa` to grant read access for the default user account. - -#### Setup and Run ITEX Multi-Node Container - -Some additional assembly is required to use this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively: - -SSH Server (Worker) - -1. *Authorized Keys* : `/etc/ssh/authorized_keys` - -SSH Client (Launcher) - -1. *Private User Key* : `/root/.ssh/id_rsa` - -To add these files correctly, follow the steps described below. - -1. 
Setup ID Keys - - You can use the commands provided below to [generate the identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH. - - ```bash - ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa - touch authorized_keys - cat id_rsa.pub >> authorized_keys - ``` - -2. Configure the permissions and ownership for all of the files you have created so far - - ```bash - chmod 600 id_rsa config authorized_keys - chown root:root id_rsa.pub id_rsa config authorized_keys - ``` - -3. Create a hostfile for horovod. (Optional) - - ```txt - Host host1 - HostName <host-ip> - IdentitiesOnly yes - IdentityFile ~/.root/id_rsa - Port <port> - Host host2 - HostName <host-ip> - IdentitiesOnly yes - IdentityFile ~/.root/id_rsa - Port <port> - ... - ``` - -4. Configure [Horovod] in your python script - - ```python - import horovod.tensorflow as hvd - - hvd.init() - ``` - -5. Now start the workers and execute DDP on the launcher - - 1. Worker run command: - - ```bash - docker run -it --rm \ - --net=host \ - -v $PWD/authorized_keys:/etc/ssh/authorized_keys \ - -v $PWD/tests:/workspace/tests \ - -w /workspace \ - intel/intel-optimized-tensorflow:2.15.1-pip-multinode \ - bash -c '/usr/sbin/sshd -D' - ``` - - 2. Launcher run command: - - ```bash - docker run -it --rm \ - --net=host \ - -v $PWD/id_rsa:/root/.ssh/id_rsa \ - -v $PWD/tests:/workspace/tests \ - -v $PWD/hostfile:/root/ssh/config \ - -w /workspace \ - intel/intel-optimized-tensorflow:2.15.1-pip-multinode \ - bash -c 'horovodrun --verbose -np 2 -H host1:1,host2:1 python /workspace/tests/tf_base_test.py' - ``` - -> [!NOTE] -> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure it for your network. - ---- - -The images below are [TensorFlow* Serving] with CPU Optimizations: - -| Tag(s) | TensorFlow | ITEX | -| ------------------------------------- | ---------- | ------------ | -| `2.14.0.1-serving-cpu`, `serving-cpu` | [v2.14.1] | [v2.14.0.1] | -| `2.13.0.0-serving-cpu` | [v2.13.0] | [v2.13.0.0] | - -### Run the Serving CPU Container - -```bash -docker run -it --rm \ - -p 8500:8500 \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - -v $PWD/workspace:/workspace \ - -w /workspace \ - -e MODEL_NAME=<model name> \ - -e MODEL_DIR=<model dir> \ - intel/intel-extension-for-tensorflow:serving-cpu -``` - -For more details, follow the procedure in the [Intel® Extension for TensorFlow* Serving] instructions. 
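- -Once a serving container is running, it can be queried over gRPC on port `8500` using the `tensorflow-serving-api` package. The sketch below is illustrative only: `my_model`, the `inputs` key, and the tensor shape are placeholders that must match your exported SavedModel's signature: - -```python -import grpc -import tensorflow as tf -from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc - -# Open an insecure channel to the serving container's gRPC port -channel = grpc.insecure_channel("localhost:8500") -stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) - -request = predict_pb2.PredictRequest() -request.model_spec.name = "my_model"  # placeholder: use the MODEL_NAME you set -request.inputs["inputs"].CopyFrom(tf.make_tensor_proto([[1.0, 2.0, 3.0]])) - -response = stub.Predict(request, timeout=10.0)  # 10 second deadline -print(response.outputs) -``` 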
- -## CPU only images with Intel® Distribution for Python* - -The images below are built only with CPU optimizations (GPU acceleration support was deliberately excluded) and include [Intel® Distribution for Python*]: - -| Tag(s) | TensorFlow | ITEX | Dockerfile | -| --------------------------- | ----------- | ------------ | --------------- | -| `2.15.1-idp-base` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | -| `2.15.0-idp-base` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | -| `2.14.0-idp-base` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | -| `2.13-idp-base` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | TensorFlow | ITEX | Dockerfile | -| -------------------- | ----------- | ------------- | --------------- | -| `2.15.1-idp-jupyter` | [v2.15.1] | [v2.15.0.1] | [v0.4.0-Beta] | -| `2.15.0-idp-jupyter` | [v2.15.0] | [v2.15.0.0] | [v0.4.0-Beta] | -| `2.14.0-idp-jupyter` | [v2.14.1] | [v2.14.0.1] | [v0.3.4] | -| `2.13-idp-jupyter` | [v2.13.0] | [v2.13.0.0] | [v0.2.3] | - -The images below additionally include [Horovod]: - -| Tag(s) | Tensorflow | ITEX | Horovod | Dockerfile | -| ------------------------------ | --------- | ------------ | --------- | --------------- | -| `2.15.1-idp-multinode` | [v2.15.1] | [v2.15.0.1] | [v0.28.1] | [v0.4.0-Beta] | -| `2.15.0-idp-multinode` | [v2.15.0] | [v2.15.0.0] | [v0.28.1] | [v0.4.0-Beta] | -| `2.14.0-idp-openmpi-multinode` | [v2.14.1] | [v2.14.0.1] | [v0.28.1] | [v0.3.4] | -| `2.13-idp-openmpi-mulitnode` | [v2.13.0] | [v2.13.0.0] | [v0.28.0] | [v0.2.3] | - -## XPU images with Intel® Distribution for Python* - -The images below are built with both CPU and GPU optimizations and include [Intel® Distribution for Python*]: - -| Tag(s) | TensorFlow | ITEX | Driver | Dockerfile | -| ---------------- | -------- | ------------ | -------- | ------ | -| `2.15.0.1-xpu-idp-base` | [v2.15.1] | [v2.15.0.1] | [803] | [v0.4.0-Beta] | -| `2.15.0-xpu-idp-base` | [v2.15.0] | [v2.15.0.0] | [803] | [v0.4.0-Beta] | - -The images below additionally include [Jupyter Notebook](https://jupyter.org/) server: - -| Tag(s) | TensorFlow | ITEX | Driver | Jupyter Port | Dockerfile | -| --------------------- | -------- | ------------- | ------ | ------------ | --------------- | -| `2.15.0.1-xpu-idp-jupyter` | [v2.15.1] | [v2.15.0.1] | [803] | `8888` | [v0.4.0-Beta] | -| `2.15.0-xpu-idp-jupyter` | [v2.15.0] | [v2.15.0.0] | [803] | `8888` | [v0.4.0-Beta] | - -## Build from Source - -To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to set up your environment, and run the following command: - -```bash -cd tensorflow -docker compose build tf-base -docker compose run tf-base -``` - -You can find the list of services below for each container in the group: - -| Service Name | Description | -| ------------- | ------------------------------------------------------------------- | -| `tf-base` | Base image with [Intel® Extension for TensorFlow*] | -| `jupyter` | Adds Jupyter Notebook server | -| `multinode` | Adds [Intel® MPI], [Horovod] and [INC] | -| `xpu` | Adds Intel GPU Support | -| `xpu-jupyter` | Adds Jupyter notebook server to GPU image | - -## License - -View the [License](https://github.com/intel/intel-extension-for-tensorflow/tree/main?tab=License-1-ov-file#readme) for the [Intel® Extension for TensorFlow*]. 
- -These images also contain other software which may be under other licenses (such as TensorFlow*, Jupyter*, Bash, etc. from the base). - -It is the image user's responsibility to ensure that any use of these images complies with any relevant licenses for all software contained within. - -\* Other names and brands may be claimed as the property of others. - - - -[Intel® Arc™ A-Series Graphics]: https://ark.intel.com/content/www/us/en/ark/products/series/227957/intel-arc-a-series-graphics.html -[Intel® Data Center GPU Flex Series]: https://ark.intel.com/content/www/us/en/ark/products/series/230021/intel-data-center-gpu-flex-series.html -[Intel® Data Center GPU Max Series]: https://ark.intel.com/content/www/us/en/ark/products/series/232874/intel-data-center-gpu-max-series.html - -[Intel® Extension for TensorFlow*]: https://github.com/intel/intel-extension-for-tensorflow -[Intel® Extension for TensorFlow* Serving]: https://intel.github.io/intel-extension-for-tensorflow/latest/docker/tensorflow-serving/README.html -[Intel® Distribution for Python*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html -[INC]: https://github.com/intel/neural-compressor -[TensorFlow*]: https://github.com/tensorflow/tensorflow -[PluggableDevice]: https://github.com/tensorflow/community/blob/master/rfcs/20200624-pluggable-device-for-tensorflow.md -[TensorFlow* Serving]: https://github.com/tensorflow/serving -[Horovod]: https://github.com/horovod/horovod -[Intel® MPI]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html#gs.9bna9o - -[v0.4.0-Beta]: https://github.com/intel/ai-containers/blob/v0.4.0-Beta/tensorflow/Dockerfile -[v0.3.4]: https://github.com/intel/ai-containers/blob/v0.3.4/tensorflow/Dockerfile -[v0.2.3]: https://github.com/intel/ai-containers/blob/v0.2.3/tensorflow/Dockerfile - -[v2.15.1]: https://github.com/tensorflow/tensorflow/releases/tag/v2.15.1 -[v2.15.0]: https://github.com/tensorflow/tensorflow/releases/tag/v2.15.0 -[v2.14.1]: https://github.com/tensorflow/tensorflow/releases/tag/v2.14.1 -[v2.13.0]: https://github.com/tensorflow/tensorflow/releases/tag/v2.13.0 - -[v2.15.0.1]: https://github.com/intel/intel-extension-for-tensorflow/releases/tag/v2.15.0.1 -[v2.15.0.0]: https://github.com/intel/intel-extension-for-tensorflow/releases/tag/v2.15.0.0 -[v2.14.0.1]: https://github.com/intel/intel-extension-for-tensorflow/releases/tag/v2.14.0.1 -[v2.13.0.0]: https://github.com/intel/intel-extension-for-tensorflow/releases/tag/v2.13.0.0 - -[v0.28.1]: https://github.com/horovod/horovod/releases/tag/v0.28.1 -[v0.28.0]: https://github.com/horovod/horovod/releases/tag/v0.28.0 - -[803.63]: https://dgpu-docs.intel.com/releases/LTS_803.63_20240617.html -[803]: https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html -[736]: https://dgpu-docs.intel.com/releases/stable_736_25_20231031.html -[647]: https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html diff --git a/docs/workflows/README.md b/docs/workflows/README.md deleted file mode 100644 index 21269eb7d..000000000 --- a/docs/workflows/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Intel® AI Workflows - -Demonstrations showing how the [Intel® AI Containers] can be used for different use cases: - -## PyTorch Workflows - -| Base Container | Device Type | Example | Description | -|----------------|-------------|---------|-------------| -| `intel/intel-optimized-pytorch:2.3.0-pip-multinode` | CPU | [Distributed LLM Fine Tuning with Kubernetes] | Demonstrates using Hugging Face 
Transformers with Intel® Xeon® Scalable Processors to fine-tune LLMs with multiple nodes from a Kubernetes cluster. The example includes an LLM fine-tuning script, Dockerfile, and Helm chart. | -| `intel/intel-optimized-pytorch:2.3.0-serving-cpu` | CPU | [TorchServe* with Kubernetes] | Demonstrates using TorchServe* with Intel® Xeon® Scalable Processors to serve models on multiple nodes from a Kubernetes cluster. The example includes a Helm chart. | - -## Build from Source - -To build the images from source, clone the [Intel® AI Containers] repository, follow the main `README.md` file to set up your environment, and run the following command: - -```bash -cd workflows/charts/huggingface-llm -docker compose build huggingface-llm -docker compose run huggingface-llm sh -c "python /workspace/scripts/finetune.py --help" -``` - -## License - -View the [License](https://github.com/intel/ai-containers/blob/main/LICENSE) for the [Intel® AI Containers]. - -These images also contain other software which may be under other licenses (such as PyTorch*, Jupyter*, Bash, etc. from the base). - -It is the image user's responsibility to ensure that any use of these images complies with any relevant licenses for all software contained within. - -\* Other names and brands may be claimed as the property of others. - - - -[Intel® AI Containers]: https://github.com/intel/ai-containers -[Distributed LLM Fine Tuning with Kubernetes]: https://github.com/intel/ai-containers/tree/main/workflows/charts/huggingface-llm -[TorchServe* with Kubernetes]: https://github.com/intel/ai-containers/tree/main/workflows/charts/torchserve diff --git a/tensorflow/docker-compose.yaml b/tensorflow/docker-compose.yaml index 957ae0063..18aec65ad 100644 --- a/tensorflow/docker-compose.yaml +++ b/tensorflow/docker-compose.yaml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -version: '3' include: - path: - ../python/docker-compose.yaml @@ -31,7 +30,7 @@ services: PYTHON_VERSION: ${PYTHON_VERSION:-3.10} REGISTRY: ${REGISTRY} REPO: ${REPO} - TF_VERSION: ${TF_VERSION:-2.15.0} + TF_VERSION: ${TF_VERSION:-2.15.1} target: tf-base-${PACKAGE_OPTION:-pip} context: . 
labels: @@ -41,20 +40,20 @@ services: org.opencontainers.base.name: "intel/python:3.10-core" org.opencontainers.image.name: "intel/intel-optimized-tensorflow" org.opencontainers.image.title: "Intel® Extension for TensorFlow Base Image" - org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base + org.opencontainers.image.version: ${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-base depends_on: - ${PACKAGE_OPTION:-pip} command: > python -c 'import tensorflow as tf; print("Tensorflow Version:", tf.__version__)' - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-base + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-base pull_policy: always jupyter: build: labels: dependency.python.pip: jupyter-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow Jupyter Image" - org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-jupyter + org.opencontainers.image.version: ${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-jupyter target: jupyter command: > bash -c "python -m jupyter --version" @@ -62,7 +61,7 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} extends: tf-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-jupyter + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-jupyter network_mode: host volumes: - /$PWD:/jupyter @@ -86,14 +85,14 @@ services: dependency.pip.apt.virtualenv: true dependency.pip.horovod: 0.28.1 dependency.python.pip: multinode/requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow MultiNode Image" - org.opencontainers.image.version: ${TF_VERSION:-2.15.0}-${PACKAGE_OPTION:-pip}-multinode + org.opencontainers.image.version: ${TF_VERSION:-2.15.1}-${PACKAGE_OPTION:-pip}-multinode target: multinode command: > bash -c "horovodrun --check-build && mpirun --version && python -c 'import horovod.tensorflow as hvd;hvd.init();import horovod.tensorflow;import neural_compressor, tf2onnx; print(\"\\nNeural Compressor Version:\", neural_compressor.__version__, \"\\\nTensorFlow2ONNX Version:\", tf2onnx.__version__)'" extends: tf-base - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} xpu: build: args: 
@@ -146,7 +145,7 @@ services: NO_PROXY: '' labels: dependency.python.pip: jupyter-requirements.txt - org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.0}-xpu-${PACKAGE_OPTION:-pip}-base" + org.opencontainers.base.name: "intel/intel-optimized-tensorflow:${TF_VERSION:-2.15.1}-xpu-${PACKAGE_OPTION:-pip}-base" org.opencontainers.image.title: "Intel® Extension for TensorFlow XPU Jupyter Image" org.opencontainers.image.version: ${TF_VER:-2.15.0}-xpu-${PACKAGE_OPTION:-pip}-jupyter target: itex-xpu-jupyter diff --git a/tensorflow/tests/tests.yaml b/tensorflow/tests/tests.yaml index 0fa5b2b3f..43af22394 100644 --- a/tensorflow/tests/tests.yaml +++ b/tensorflow/tests/tests.yaml @@ -14,7 +14,7 @@ --- import-itex-cpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-base cmd: python -c "from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())" import-itex-xpu-${PACKAGE_OPTION:-pip}: img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-base @@ -24,20 +24,20 @@ import-itex-xpu-${PACKAGE_OPTION:-pip}: - src: ${PWD}/tensorflow/tests dst: /tests import-cpu-jupyter-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-jupyter + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-jupyter cmd: python -m jupyter --version import-xpu-jupyter-${PACKAGE_OPTION:-pip}: img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-itex-${TF_VERSION:-2.15.1}-itex-xpu-jupyter cmd: python -m jupyter --version device: ["/dev/dri"] import-multinode-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: horovodrun --check-build && mpirun --version && python -c 'import horovod.tensorflow as hvd;hvd.init();import horovod.tensorflow' import-inc-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: python -c "import neural_compressor as inc;print(inc.__version__)" 
itex-cpu-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-base + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-base cmd: python /tests/tf_base_test.py volumes: - src: ${PWD}/tensorflow/tests @@ -55,13 +55,13 @@ itex-xpu-jupyter-${PACKAGE_OPTION:-pip}: notebook: True device: ["/dev/dri"] multinode-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: horovodrun -np 2 -H localhost:2 --binding-args="-bind-to socket -map-by socket" python /tests/tf_base_test.py volumes: - dst: /tests src: $PWD/tensorflow/tests inc-${PACKAGE_OPTION:-pip}: - img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.0}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-2.6} + img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-${TF_VERSION:-2.15.1}-horovod-${HOROVOD_VERSION:-0.28.1}-inc-${INC_VERSION:-3.0} cmd: bash /tests/inc_test.sh volumes: - dst: /tests From 1f76e4103f796db279a310cbd46c184144334bec Mon Sep 17 00:00:00 2001 From: tylertitsworth Date: Fri, 16 Aug 2024 16:32:55 -0700 Subject: [PATCH 4/4] val idp build Signed-off-by: tylertitsworth --- tensorflow/Dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/Dockerfile b/tensorflow/Dockerfile index 4f3d0f6ee..48fb73321 100644 --- a/tensorflow/Dockerfile +++ b/tensorflow/Dockerfile @@ -33,12 +33,11 @@ ENV KMP_AFFINITY='granularity=fine,verbose,compact,1,0' \ KMP_BLOCKTIME=1 \ KMP_SETTINGS=1 -ARG TF_VERSION - WORKDIR / COPY requirements.txt . -RUN python -m pip install --no-cache-dir -r requirements.txt +RUN python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/ @@ -53,12 +52,13 @@ ENV KMP_AFFINITY='granularity=fine,verbose,compact,1,0' \ ENV PATH /usr/bin:/root/conda/envs/idp/bin:/root/conda/condabin:~/conda/bin/:${PATH} ENV TF_ENABLE_ONEDNN_OPTS=1 -ARG TF_VERSION WORKDIR / COPY requirements.txt . 
-RUN python -m pip install --no-cache-dir -r requirements.txt +RUN conda run -n idp python -m pip install --no-cache-dir -r requirements.txt && \ + rm -rf requirements.txt && \ + conda clean -y --all ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/THIRD-PARTY-PROGRAMS.txt /licenses/ ADD https://raw.githubusercontent.com/intel/intel-extension-for-tensorflow/master/third-party-programs/dockerlayer/third-party-program-of-intel-extension-for-tensorflow.txt /licenses/ @@ -124,7 +124,7 @@ ARG HOROVOD_WITH_MPI=1 ENV LD_LIBRARY_PATH /lib64/:/usr/lib64/:/usr/local/lib64 -RUN pip install --no-cache-dir horovod==0.28.1 +RUN python -m pip install --no-cache-dir horovod==0.28.1 ARG PYTHON_VERSION