diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml b/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml
index 07fe82c07f3..209e60ee4dd 100644
--- a/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml
+++ b/.azure-pipelines/scripts/codeScan/pyspelling/pyspelling_conf.yaml
@@ -7,7 +7,7 @@ matrix:
- ${DICT_DIR}/lpot_dict.txt
output: ${DICT_DIR}/lpot_dict.dic
sources:
- - ${REPO_DIR}/docs/*
+ - ${REPO_DIR}/docs/source/*.md
- ${REPO_DIR}/*.md
- ${REPO_DIR}/examples/**/*.md|!${REPO_DIR}/examples/pytorch/**/huggingface_models/**/*.md
- ${REPO_DIR}/neural_compressor/**/*.md
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index e4e37f8fc72..69348ebd344 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -14,16 +14,17 @@ jobs:
- uses: actions/checkout@v1
- name: Install dependencies
run: |
- export PATH="$HOME/.local/bin:$PATH"
+          export PATH="$HOME/.local/bin:$PATH"
sudo apt-get install -y python3-setuptools
- pip3 install --user -r sphinx-requirements.txt
+ pip3 install --user -r docs/sphinx-requirements.txt
- name: Build the docs
run: |
export PATH="$HOME/.local/bin:$PATH"
+ cd docs/
make html
- name: Push the docs
uses: peaceiris/actions-gh-pages@v3
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
- publish_dir: _build/html
+ publish_dir: docs/_build/html
publish_branch: latestHTML
\ No newline at end of file
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 7b308f858cc..00000000000
--- a/Makefile
+++ /dev/null
@@ -1,34 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS =
-SPHINXBUILD = sphinx-build
-SPHINXPROJ = ProjectnameIntelLowPrecisionOptimizationTool
-SOURCEDIR = .
-BUILDDIR = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-
-html:
- $(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O)
- cp _static/index.html $(BUILDDIR)/html/index.html
- mkdir "$(BUILDDIR)/html/docs/imgs"
- cp docs/imgs/architecture.png "$(BUILDDIR)/html/docs/imgs/architecture.png"
- cp docs/imgs/workflow.png "$(BUILDDIR)/html/docs/imgs/workflow.png"
- cp docs/imgs/INC_GUI.gif "$(BUILDDIR)/html/docs/imgs/INC_GUI.gif"
- cp docs/imgs/release_data.png "$(BUILDDIR)/html/docs/imgs/release_data.png"
- cp "$(BUILDDIR)/html/README.html" "$(BUILDDIR)/html/README.html.tmp"
- sed 's/.md/.html/g' "$(BUILDDIR)/html/README.html.tmp" > "$(BUILDDIR)/html/README.html"
- rm -f "$(BUILDDIR)/html/README.html.tmp"
-
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/README.md b/README.md
index dc76479528d..ebe3e37ed57 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ Python version: 3.7, 3.8, 3.9, 3.10
# Or install nightly full version from pip (including GUI)
pip install -i https://test.pypi.org/simple/ neural-compressor-full
```
-More installation methods can be found at [Installation Guide](./docs/installation_guide.md). Please check out our [FAQ](./docs/faq.md) for more details.
+More installation methods can be found at [Installation Guide](./docs/source/installation_guide.md). Please check out our [FAQ](./docs/source/faq.md) for more details.
## Getting Started
### Quantization with Python API
@@ -71,7 +71,7 @@ Search for ```jupyter-lab-neural-compressor``` in the Extension Manager in Jupyt
-### Quantization with [GUI](./docs/bench.md)
+### Quantization with [GUI](./docs/source/bench.md)
```shell
# An ONNX Example
pip install onnx==1.12.0 onnxruntime==1.12.1 onnxruntime-extensions
@@ -80,8 +80,8 @@ wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/
# Start GUI
inc_bench
```
-
-
+
+
## System Requirements
@@ -98,7 +98,7 @@ inc_bench
#### Intel® Neural Compressor quantized ONNX models support multiple hardware vendors through ONNX Runtime:
-* Intel CPU, AMD/ARM CPU, and NVidia GPU. Please refer to the validated model [list](./docs/validated_model_list.md#Validated-ONNX-QDQ-INT8-models-on-multiple-hardware-through-ONNX-Runtime).
+* Intel CPU, AMD/ARM CPU, and NVidia GPU. Please refer to the validated model [list](./docs/source/validated_model_list.md#Validated-ONNX-QDQ-INT8-models-on-multiple-hardware-through-ONNX-Runtime).
### Validated Software Environment
@@ -146,11 +146,11 @@ inc_bench
> Set the environment variable ``TF_ENABLE_ONEDNN_OPTS=1`` to enable oneDNN optimizations if you are using TensorFlow v2.6 to v2.8. oneDNN is the default for TensorFlow v2.9.
### Validated Models
-Intel® Neural Compressor validated 420+ [examples](./examples) for quantization with a performance speedup geomean of 2.2x and up to 4.2x on VNNI while minimizing accuracy loss. Over 30 pruning and knowledge distillation samples are also available. More details for validated models are available [here](docs/validated_model_list.md).
+Intel® Neural Compressor validated 420+ [examples](./examples) for quantization with a performance speedup geomean of 2.2x and up to 4.2x on VNNI while minimizing accuracy loss. Over 30 pruning and knowledge distillation samples are also available. More details for validated models are available [here](./docs/source/validated_model_list.md).
@@ -235,13 +235,13 @@ Intel® Neural Compressor validated 420+ [examples](./examples) for quantization
* Neural Coder, a new plug-in for Intel Neural Compressor was covered by [Twitter](https://twitter.com/IntelDevTools/status/1583629213697212416), [LinkedIn](https://www.linkedin.com/posts/intel-software_oneapi-ai-deeplearning-activity-6989377309917007872-Dbzg?utm_source=share&utm_medium=member_desktop), and [Intel Developer Zone](https://mp.weixin.qq.com/s/LL-4eD-R0YagFgODM23oQA) from Intel, and [Twitter](https://twitter.com/IntelDevTools/status/1583629213697212416/retweets) and [LinkedIn](https://www.linkedin.com/feed/update/urn:li:share:6990377841435574272/) from Hugging Face. (Oct 2022)
* Intel Neural Compressor successfully landed on [GCP](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [AWS](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel) marketplace. (Oct 2022)
-> View our [full publication list](docs/publication_list.md).
+> View our [full publication list](./docs/source/publication_list.md).
## Additional Content
-* [Release Information](docs/releases_info.md)
-* [Contribution Guidelines](docs/contributions.md)
-* [Legal Information](docs/legal_information.md)
+* [Release Information](./docs/source/releases_info.md)
+* [Contribution Guidelines](./docs/source/contributions.md)
+* [Legal Information](./docs/source/legal_information.md)
* [Security Policy](SECURITY.md)
* [Intel® Neural Compressor Website](https://intel.github.io/neural-compressor)
diff --git a/_static/custom.css b/_static/custom.css
deleted file mode 100755
index b2d7a2ec6c2..00000000000
--- a/_static/custom.css
+++ /dev/null
@@ -1,18 +0,0 @@
-/* make the page 1000px */
-.wy-nav-content {
- max-width: 1000px;
-}
-
-/* code block highlight color in rtd changed to lime green, no no no */
-
-.rst-content tt.literal, .rst-content code.literal, .highlight {
- background: #f0f0f0;
-}
-.rst-content tt.literal, .rst-content code.literal {
- color: #000000;
-}
-
-table.docutils th {
- text-align: center;
- vertical-align: middle;
-}
\ No newline at end of file
diff --git a/api-documentation/api-reference.rst b/api-documentation/api-reference.rst
deleted file mode 100755
index 784f6bae5eb..00000000000
--- a/api-documentation/api-reference.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-API Reference
-#############
-
-Read an `introduction to Intel Neural Compressor APIs <../docs/api-introduction.md>`__.
-
-The following APIs are available:
-
-.. toctree::
- :maxdepth: 1
-
- benchmark-api
- objective-api
- pruning-api
- quantization-api
-
-
\ No newline at end of file
diff --git a/api-documentation/benchmark-api.rst b/api-documentation/benchmark-api.rst
deleted file mode 100755
index c6f3da1e87b..00000000000
--- a/api-documentation/benchmark-api.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-.. _benchmark-api
-
-Benchmark
-#########
-
-.. automodule:: neural_compressor.benchmark
- :members:
-
-.. autoclass:: neural_compressor.benchmark.Benchmark
- :members:
\ No newline at end of file
diff --git a/api-documentation/objective-api.rst b/api-documentation/objective-api.rst
deleted file mode 100755
index de63375c256..00000000000
--- a/api-documentation/objective-api.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-.. _objective-api
-
-Objective
-#########
-
-.. automodule:: neural_compressor.objective
- :members:
-
-.. autoclass:: neural_compressor.objective.Measurer
- :members:
-
-.. autoclass:: neural_compressor.objective.FootprintMeasure
- :members:
\ No newline at end of file
diff --git a/api-documentation/pruning-api.rst b/api-documentation/pruning-api.rst
deleted file mode 100755
index ff2fd584546..00000000000
--- a/api-documentation/pruning-api.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-.. _pruning-api
-
-Pruning
-#######
-
-.. automodule:: neural_compressor.pruning
- :members:
-
-.. autoclass:: neural_compressor.pruning.Pruning
- :members:
\ No newline at end of file
diff --git a/api-documentation/quantization-api.rst b/api-documentation/quantization-api.rst
deleted file mode 100755
index af1017d0ef1..00000000000
--- a/api-documentation/quantization-api.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-.. _quantization-api
-
-Quantization
-############
-
-.. automodule:: neural_compressor.quantization
- :members:
\ No newline at end of file
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 00000000000..329d0ea574d
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,44 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+SOURCEDIR = source
+BUILDDIR = _build
+IMGDIR = source/_static/imgs
+BUILDIMGDIR = _build/html/imgs
+CODEIMGDIR = _build/html/_static
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+
+html:
+	# copy README.md into docs and rewrite its relative links for the rendered site
+ cp -f "../README.md" "./source/README.md"
+ cp -f "./source/README.md" "./source/README.md.tmp"
+ sed 's/.md/.html/g; s/.\/docs\/source\//.\//g; s/.\/neural_coder\/extensions\/screenshots/imgs/g; s/.\/docs\/source\/_static/..\/\/_static/g;' "./source/README.md.tmp" > "./source/README.md"
+ rm -f "./source/README.md.tmp"
+
+	# make sure the remaining images display correctly
+ $(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O)
+
+ cp source/_static/index.html $(BUILDDIR)/html/index.html
+ mkdir -p "$(BUILDIMGDIR)"
+ # common svg
+ cp -f "$(CODEIMGDIR)/imgs/common/code.svg" "$(CODEIMGDIR)/images/view-page-source-icon.svg"
+ cp -f "$(CODEIMGDIR)/imgs/common/right.svg" "$(CODEIMGDIR)/images/chevron-right-orange.svg"
+
+ cp "../neural_coder/extensions/screenshots/extmanager.png" "$(BUILDIMGDIR)/extmanager.png"
+ cp "$(IMGDIR)/INC_GUI.gif" "$(BUILDIMGDIR)/INC_GUI.gif"
+ cp "$(IMGDIR)/release_data.png" "$(BUILDIMGDIR)/release_data.png"
+
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/design.md b/docs/design.md
deleted file mode 100644
index bee2fa124b8..00000000000
--- a/docs/design.md
+++ /dev/null
@@ -1,15 +0,0 @@
-Design
-=====
-Intel® Neural Compressor features an architecture and workflow that aids in increasing performance and faster deployments across infrastructures.
-
-## Architecture
-
-
-
-
-
-## Workflow
-
-
-
-
diff --git a/make.bat b/docs/make.bat
similarity index 95%
rename from make.bat
rename to docs/make.bat
index 695a8b3ecfd..f9a02b02da3 100644
--- a/make.bat
+++ b/docs/make.bat
@@ -1,36 +1,36 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-set SPHINXPROJ=ProjectnameIntelLowPrecisionOptimizationTool
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.If you don't have Sphinx installed, grab it from
- echo.http://sphinx-doc.org/
- exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-
-:end
-popd
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+set SPHINXPROJ=ProjectnameIntelLowPrecisionOptimizationTool
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/docs/CODE_OF_CONDUCT.md b/docs/source/CODE_OF_CONDUCT.md
similarity index 100%
rename from docs/CODE_OF_CONDUCT.md
rename to docs/source/CODE_OF_CONDUCT.md
diff --git a/docs/FX.md b/docs/source/FX.md
similarity index 100%
rename from docs/FX.md
rename to docs/source/FX.md
diff --git a/docs/NAS.md b/docs/source/NAS.md
similarity index 99%
rename from docs/NAS.md
rename to docs/source/NAS.md
index a2eb0eb456b..98eac4d8217 100644
--- a/docs/NAS.md
+++ b/docs/source/NAS.md
@@ -136,7 +136,7 @@ Dynamic Neural Architecture Search (DyNAS) is a super-network-based NAS approach
The flow of the DyNAS approach is shown in the following figure. In the first phase of the search, a small population of sub-networks are randomly sampled from the super-network and evaluated (validation measurement) to provide the initial training set for the inner predictor loop. After the predictors are trained, a multi-objective evolutionary search is performed in the predictor objective space. After this extensive search is performed, the best performing sub-network configurations are selected to be the next iteration's validation population. The cycle continues until the search concludes when the user defined evaluation count is met.
-
+
This class is also registered to the Intel® Neural Compressor as a built-in NAS method through a decorator `nas_registry`, its interface is shown below.
diff --git a/docs/PTQ.md b/docs/source/PTQ.md
similarity index 100%
rename from docs/PTQ.md
rename to docs/source/PTQ.md
diff --git a/docs/QAT.md b/docs/source/QAT.md
similarity index 98%
rename from docs/QAT.md
rename to docs/source/QAT.md
index 7bad1c0fcd0..e899f2157c1 100644
--- a/docs/QAT.md
+++ b/docs/source/QAT.md
@@ -4,7 +4,7 @@
Quantization-aware training (QAT) simulates low-precision inference-time computation in the forward pass of the training process. With QAT, all weights and activations are "fake quantized" during both the forward and backward passes of training: that is, float values are rounded to mimic int8 values, but all computations are still done with floating point numbers. Thus, all the weight adjustments during training are made while "aware" of the fact that the model will ultimately be quantized; after quantizing, therefore, this method will usually yield higher accuracy than either dynamic quantization or post-training static quantization.
-
+
## Usage
diff --git a/docs/source/README.md b/docs/source/README.md
new file mode 100644
index 00000000000..c912168f5a7
--- /dev/null
+++ b/docs/source/README.md
@@ -0,0 +1,251 @@
+
+An open-source Python library supporting popular model compression techniques on all mainstream deep learning frameworks (TensorFlow, PyTorch, ONNX Runtime, and MXNet)
+
+Intel® Neural Compressor, formerly known as Intel® Low Precision Optimization Tool, is an open-source Python library that runs on Intel CPUs and GPUs and delivers unified interfaces across multiple deep-learning frameworks for popular network compression technologies such as quantization, pruning, and knowledge distillation. The tool supports automatic accuracy-driven tuning strategies to help the user quickly find the best quantized model. It also implements weight-pruning algorithms to generate pruned models with a predefined sparsity goal, and supports knowledge distillation to transfer knowledge from a teacher model to a student model.
+Intel® Neural Compressor is a critical AI software component in the [Intel® oneAPI AI Analytics Toolkit](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit.html).
+
+
+**Visit the Intel® Neural Compressor online document website at: <https://intel.github.io/neural-compressor>.**
+
+## Installation
+
+### Prerequisites
+
+Python version: 3.7, 3.8, 3.9, 3.10
+
+### Install on Linux
+- Release binary install
+ ```Shell
+ # install stable basic version from pip
+ pip install neural-compressor
+ # Or install stable full version from pip (including GUI)
+ pip install neural-compressor-full
+ ```
+- Nightly binary install
+ ```Shell
+ git clone https://github.com/intel/neural-compressor.git
+ cd neural-compressor
+ pip install -r requirements.txt
+ # install nightly basic version from pip
+ pip install -i https://test.pypi.org/simple/ neural-compressor
+ # Or install nightly full version from pip (including GUI)
+ pip install -i https://test.pypi.org/simple/ neural-compressor-full
+ ```
+More installation methods can be found at [Installation Guide](./installation_guide.html). Please check out our [FAQ](./faq.html) for more details.
+
+## Getting Started
+### Quantization with Python API
+
+```shell
+# A TensorFlow Example
+pip install tensorflow
+# Prepare fp32 model
+wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_6/mobilenet_v1_1.0_224_frozen.pb
+```
+```python
+import tensorflow as tf
+from neural_compressor.experimental import Quantization, common
+quantizer = Quantization()
+quantizer.model = './mobilenet_v1_1.0_224_frozen.pb'
+dataset = quantizer.dataset('dummy', shape=(1, 224, 224, 3))
+quantizer.calib_dataloader = common.DataLoader(dataset)
+quantizer.fit()
+```
+### Quantization with [JupyterLab Extension](./neural_coder/extensions/neural_compressor_ext_lab/README.html)
+Search for ```jupyter-lab-neural-compressor``` in the Extension Manager in JupyterLab and install with one click:
+
+
+
+
+
+### Quantization with [GUI](./bench.html)
+```shell
+# An ONNX Example
+pip install onnx==1.12.0 onnxruntime==1.12.1 onnxruntime-extensions
+# Prepare fp32 model
+wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx
+# Start GUI
+inc_bench
+```
+
+
+
+
+## System Requirements
+
+### Validated Hardware Environment
+#### Intel® Neural Compressor supports CPUs based on [Intel 64 architecture or compatible processors](https://en.wikipedia.org/wiki/X86-64):
+
+* Intel Xeon Scalable processor (formerly Skylake, Cascade Lake, Cooper Lake, and Icelake)
+* Future Intel Xeon Scalable processor (code name Sapphire Rapids)
+
+#### Intel® Neural Compressor supports GPUs built on Intel's Xe architecture:
+
+* [Intel® Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/data-center-gpu/flex-series/overview.html)
+
+#### Intel® Neural Compressor quantized ONNX models support multiple hardware vendors through ONNX Runtime:
+
+* Intel CPU, AMD/ARM CPU, and NVidia GPU. Please refer to the validated model [list](./validated_model_list.html#Validated-ONNX-QDQ-INT8-models-on-multiple-hardware-through-ONNX-Runtime).
+
+### Validated Software Environment
+
+* OS version: CentOS 8.4, Ubuntu 20.04
+* Python version: 3.7, 3.8, 3.9, 3.10
+
+
+
+> **Note:**
+> Set the environment variable ``TF_ENABLE_ONEDNN_OPTS=1`` to enable oneDNN optimizations if you are using TensorFlow v2.6 to v2.8. oneDNN is the default for TensorFlow v2.9.
+
+### Validated Models
+Intel® Neural Compressor validated 420+ [examples](./examples) for quantization with a performance speedup geomean of 2.2x and up to 4.2x on VNNI while minimizing accuracy loss. Over 30 pruning and knowledge distillation samples are also available. More details for validated models are available [here](./validated_model_list.html).
+
+
+{% endblock %}
+
+{% block menu %}
+
+{{ super() }}
+{% endblock %}
+
+{% block sidebartitle %}
+
+{% endblock %}
+
+
+{%- block footer %}
+
+
+
+
+
+{% endblock %}
+
+
+
+
diff --git a/docs/adaptor.md b/docs/source/adaptor.md
similarity index 100%
rename from docs/adaptor.md
rename to docs/source/adaptor.md
diff --git a/docs/source/api-documentation/adaptor.rst b/docs/source/api-documentation/adaptor.rst
new file mode 100644
index 00000000000..83dc04c0181
--- /dev/null
+++ b/docs/source/api-documentation/adaptor.rst
@@ -0,0 +1,9 @@
+Adaptor
+###########
+
+The adaptor API information is available:
+
+.. toctree::
+ :maxdepth: 1
+
+
diff --git a/docs/source/api-documentation/api-introduction.md b/docs/source/api-documentation/api-introduction.md
new file mode 100644
index 00000000000..e5fc85f5d21
--- /dev/null
+++ b/docs/source/api-documentation/api-introduction.md
@@ -0,0 +1,210 @@
+API Documentation
+=================
+
+## Introduction
+
+Intel® Neural Compressor is an open-source Python library designed to help users quickly deploy low-precision inference solutions on popular deep learning (DL) frameworks such as TensorFlow*, PyTorch*, MXNet, and ONNX Runtime. It automatically optimizes low-precision recipes for deep learning models in order to achieve optimal product objectives, such as inference performance and memory usage, with expected accuracy criteria.
+
+
+## User-facing APIs
+
+These APIs are intended to unify low-precision quantization interfaces across multiple DL frameworks for the best out-of-the-box experience.
+
+> **Note**
+>
+> Neural Compressor is continuously improving user-facing APIs to create a better user experience.
+
+> Two sets of user-facing APIs exist. One is the default set supported since Neural Compressor v1.0 for backwards compatibility. The other set consists of new APIs in
+the `neural_compressor.experimental` package.
+
+> We recommend that you use the APIs located in neural_compressor.experimental. All examples have been updated to use the experimental APIs.
+
+The major differences between the default user-facing APIs and the experimental APIs are:
+
+1. The experimental APIs abstract the `neural_compressor.experimental.common.Model` concept to cover cases where the weight and graph files are stored separately.
+2. The experimental APIs unify the calling style of the `Quantization`, `Pruning`, and `Benchmark` classes by setting the model, calibration dataloader, evaluation dataloader, and metric through class attributes rather than passing them as function inputs.
+3. The experimental APIs refine Neural Compressor built-in transforms/datasets/metrics by unifying the APIs across different framework backends.
+
+## Experimental user-facing APIs
+
+Experimental user-facing APIs consist of the following components:
+
+### Quantization-related APIs
+
+```python
+# neural_compressor.experimental.Quantization
+class Quantization(object):
+ def __init__(self, conf_fname_or_obj):
+ ...
+
+ def __call__(self):
+ ...
+
+ @property
+ def calib_dataloader(self):
+ ...
+
+ @property
+ def eval_dataloader(self):
+ ...
+
+ @property
+ def model(self):
+ ...
+
+ @property
+ def metric(self):
+ ...
+
+ @property
+ def postprocess(self, user_postprocess):
+ ...
+
+ @property
+ def q_func(self):
+ ...
+
+ @property
+ def eval_func(self):
+ ...
+
+```
+The `conf_fname_or_obj` parameter used in the class initialization is the path to the user yaml configuration file or a `Quantization_Conf` object. This yaml file controls the entire tuning behavior for the model.
+
+**Neural Compressor User YAML Syntax**
+
+> Intel® Neural Compressor provides template yaml files for [Post-Training Quantization](../../../neural_compressor/template/ptq.yaml), [Quantization-Aware Training](../../../neural_compressor/template/qat.yaml), and [Pruning](../../../neural_compressor/template/pruning.yaml) scenarios. Refer to these template files to understand the meaning of each field.
+
+> Note that most fields in the yaml templates are optional. View the [HelloWorld Yaml](../../../examples/helloworld/tf_example2/conf.yaml) example for reference.
+
+```python
+# Typical Launcher code
+from neural_compressor.experimental import Quantization, common
+
+# optional if Neural Compressor built-in dataset could be used as model input in yaml
+class dataset(object):
+ def __init__(self, *args):
+ ...
+
+ def __getitem__(self, idx):
+ # return single sample and label tuple without collate. label should be 0 for label-free case
+ ...
+
+    def __len__(self):
+ ...
+
+# optional if Neural Compressor built-in metric could be used to do accuracy evaluation on model output in yaml
+class custom_metric(object):
+ def __init__(self):
+ ...
+
+ def update(self, predict, label):
+ # metric update per mini-batch
+ ...
+
+ def result(self):
+        # final metric calculation invoked only once after all mini-batches are evaluated
+ # return a scalar to neural_compressor for accuracy-driven tuning.
+ # by default the scalar is higher-is-better. if not, set tuning.accuracy_criterion.higher_is_better to false in yaml.
+ ...
+
+quantizer = Quantization('conf.yaml')
+quantizer.model = '/path/to/model'
+# below two lines are optional if Neural Compressor built-in dataset is used as model calibration input in yaml
+cal_dl = dataset('/path/to/calibration/dataset')
+quantizer.calib_dataloader = common.DataLoader(cal_dl, batch_size=32)
+# below two lines are optional if Neural Compressor built-in dataset is used as model evaluation input in yaml
+dl = dataset('/path/to/evaluation/dataset')
+quantizer.eval_dataloader = common.DataLoader(dl, batch_size=32)
+# optional if Neural Compressor built-in metric could be used to do accuracy evaluation in yaml
+quantizer.metric = common.Metric(custom_metric)
+q_model = quantizer.fit()
+q_model.save('/path/to/output/dir')
+```
+
+The `model` attribute in the `Quantization` class is an abstraction of model formats across different frameworks. Neural Compressor supports passing the path of a `keras model`, `frozen pb`, `checkpoint`, `saved model`, `torch.nn.model`, `mxnet.symbol.Symbol`, `gluon.HybridBlock`, or `onnx model` to instantiate a `neural_compressor.experimental.common.Model` class and set it to `quantizer.model`.
+
+The `calib_dataloader` and `eval_dataloader` attributes in the `Quantization` class are used to set up calibration and evaluation dataloaders in code. They are optional if the user sets the corresponding fields in yaml.
+
+The `metric` attribute in the `Quantization` class is used to set up a custom metric in code. It is optional if a Neural Compressor built-in metric can be used with the model and the corresponding fields are set in yaml.
+
+The `postprocess` attribute in the `Quantization` class is not necessary in most use cases. It is only needed when the user wants to use a built-in metric but the model output cannot be handled directly by the Neural Compressor built-in metrics. In this case, the user can register a transform to convert the model output to the form expected by the built-in metric.
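+
+A minimal sketch of that flow, assuming the `common.Postprocess` wrapper mirrors `common.Metric` and using a hypothetical transform that maps raw logits to label indices:
+
+```python
+from neural_compressor.experimental import common
+
+class LogitsToLabels(object):
+    """Hypothetical postprocess transform: convert (logits, labels) into the form a built-in metric expects."""
+    def __call__(self, sample):
+        logits, labels = sample
+        return logits.argmax(axis=-1), labels
+
+quantizer.postprocess = common.Postprocess(LogitsToLabels)
+```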
+
+The `q_func` attribute in the `Quantization` class is only for the `Quantization Aware Training` case, in which the user needs to register a function that takes `model` as the input parameter and executes the entire training process with self-contained training hyper-parameters.
+
+The `eval_func` attribute in the `Quantization` class is reserved for special cases. If the user already has an evaluation function from training the model, the user must implement a `calib_dataloader` and leave `eval_dataloader` as None, then modify that evaluation function to take `model` as its input parameter and return a higher-is-better scalar. In some scenarios, this can reduce development effort.
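+
+A minimal sketch of the `eval_func` path, assuming a user-defined `evaluate(model)` helper that wraps the user's existing evaluation loop and reusing the `dataset` class from the launcher example above:
+
+```python
+from neural_compressor.experimental import Quantization, common
+
+def evaluate(model):
+    # user's existing evaluation loop; must return a higher-is-better scalar, e.g. top-1 accuracy
+    ...
+
+quantizer = Quantization('conf.yaml')
+quantizer.model = '/path/to/model'
+quantizer.calib_dataloader = common.DataLoader(dataset('/path/to/calibration/dataset'), batch_size=32)
+quantizer.eval_func = evaluate   # used instead of eval_dataloader + metric
+q_model = quantizer.fit()
+```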
+
+
+### Pruning-related APIs (POC)
+
+```python
+class Pruning(object):
+ def __init__(self, conf_fname_or_obj):
+ ...
+
+ def on_epoch_begin(self, epoch):
+ ...
+
+ def on_step_begin(self, batch_id):
+ ...
+
+ def on_step_end(self):
+ ...
+
+ def on_epoch_end(self):
+ ...
+
+ def __call__(self):
+ ...
+
+ @property
+ def model(self):
+ ...
+
+ @property
+ def q_func(self):
+ ...
+
+```
+
+This API is used to do sparsity pruning. Currently, it is a Proof of Concept; Neural Compressor only supports `magnitude pruning` on PyTorch.
+
+To learn how to use this API, refer to the [pruning document](../pruning.md).
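+
+A minimal usage sketch based on the class listing above, assuming `train_func` is a user-defined PyTorch training loop that calls the `on_epoch_begin`/`on_step_begin`/`on_step_end`/`on_epoch_end` hooks:
+
+```python
+from neural_compressor.experimental import Pruning
+
+prune = Pruning('/path/to/pruning/conf.yaml')
+prune.model = model          # the torch.nn.Module to be pruned
+prune.q_func = train_func    # training function that invokes the pruning hooks
+pruned_model = prune()       # runs magnitude pruning according to the yaml schedule
+```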
+
+### Benchmarking-related APIs
+```python
+class Benchmark(object):
+ def __init__(self, conf_fname_or_obj):
+ ...
+
+ def __call__(self):
+ ...
+
+ @property
+ def model(self):
+ ...
+
+ @property
+ def metric(self):
+ ...
+
+ @property
+ def b_dataloader(self):
+ ...
+
+ @property
+ def postprocess(self, user_postprocess):
+ ...
+```
+
+This API is used to measure model performance and accuracy.
+
+To learn how to use this API, refer to the [benchmarking document](../benchmark.md).
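+
+A minimal usage sketch based on the class listing above, reusing the `dataset` class from the quantization launcher example; the benchmark mode (accuracy or performance) is assumed to come from the yaml configuration:
+
+```python
+from neural_compressor.experimental import Benchmark, common
+
+evaluator = Benchmark('conf.yaml')
+evaluator.model = '/path/to/model'
+# optional if a built-in dataloader is configured in yaml
+evaluator.b_dataloader = common.DataLoader(dataset('/path/to/evaluation/dataset'), batch_size=32)
+results = evaluator()        # benchmark mode(s) are taken from the yaml configuration
+```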
+
+## Default user-facing APIs
+
+The default user-facing APIs exist for backwards compatibility from the v1.0 release. Refer to [v1.1 API](https://github.com/intel/neural-compressor/blob/v1.1/docs/introduction.md) to understand how the default user-facing APIs work.
+
+View the [HelloWorld example](/examples/helloworld/tf_example6) that uses default user-facing APIs for user reference.
+
+Full examples using default user-facing APIs can be found [here](https://github.com/intel/neural-compressor/tree/v1.1/examples).
diff --git a/api-documentation/apis.rst b/docs/source/api-documentation/apis.rst
old mode 100755
new mode 100644
similarity index 59%
rename from api-documentation/apis.rst
rename to docs/source/api-documentation/apis.rst
index a9bac5c00e4..921dc1beb34
--- a/api-documentation/apis.rst
+++ b/docs/source/api-documentation/apis.rst
@@ -6,5 +6,8 @@ The following API information is available:
.. toctree::
:maxdepth: 1
- api-reference
- ../docs/api-introduction.md
\ No newline at end of file
+ component
+ common
+ strategy
+ adaptor
+ pythonic
diff --git a/docs/source/api-documentation/common.rst b/docs/source/api-documentation/common.rst
new file mode 100644
index 00000000000..285c9136026
--- /dev/null
+++ b/docs/source/api-documentation/common.rst
@@ -0,0 +1,15 @@
+Common
+###########
+
+The common API information is available:
+
+.. toctree::
+ :maxdepth: 1
+
+ common/data
+ common/metric
+ common/model
+ common/criterion
+ common/benchmark
+ common/optimizer
+
diff --git a/docs/source/api-documentation/common/benchmark.rst b/docs/source/api-documentation/common/benchmark.rst
new file mode 100644
index 00000000000..0af068b64ba
--- /dev/null
+++ b/docs/source/api-documentation/common/benchmark.rst
@@ -0,0 +1,7 @@
+Benchmark
+==============
+
+.. autoapisummary::
+
+ neural_compressor.experimental.benchmark
+
diff --git a/docs/source/api-documentation/common/bleu.rst b/docs/source/api-documentation/common/bleu.rst
new file mode 100644
index 00000000000..7b4c04997e4
--- /dev/null
+++ b/docs/source/api-documentation/common/bleu.rst
@@ -0,0 +1,61 @@
+BLEU
+====================================================
+
+.. py:module:: neural_compressor.experimental.metric.bleu
+
+
+Module Contents
+---------------
+
+Classes
+~~~~~~~
+
+.. autoapisummary::
+
+ neural_compressor.experimental.metric.bleu.BLEU
+
+.. py:class:: BLEU
+
+ Bases: :py:obj:`object`
+
+ Computes the BLEU (Bilingual Evaluation Understudy) score.
+
+ BLEU is an algorithm for evaluating the quality of text which has
+ been machine-translated from one natural language to another.
+   This implementation approximates the BLEU score since we do not
+   glue word pieces or decode the ids and tokenize the output.
+   By default, we use an n-gram order of 4 and apply a brevity penalty.
+   Beam search is not used.
+
+ .. attribute:: predictions
+
+ List of translations to score.
+
+ .. attribute:: labels
+
+ List of the reference corresponding to the prediction result.
+
+ .. py:method:: reset() -> None
+
+ Clear the predictions and labels in the cache.
+
+
+ .. py:method:: update(prediction: Sequence[str], label: Sequence[str]) -> None
+
+ Add the prediction and label.
+
+ :param prediction: The prediction result.
+ :param label: The reference corresponding to the prediction result.
+
+      :raises ValueError: An error occurred when the lengths of the prediction and label are different.
+
+
+ .. py:method:: result() -> float
+
+ Compute the BLEU score.
+
+ :returns: The approximate BLEU score.
+ :rtype: bleu_score
+
+
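+Example (a usage sketch based on the signatures above, with illustrative sentences):
+
+.. code-block:: python
+
+   from neural_compressor.experimental.metric.bleu import BLEU
+
+   bleu = BLEU()
+   bleu.update(["the cat sat on the mat"], ["the cat is on the mat"])
+   score = bleu.result()   # approximate corpus-level BLEU as a float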
diff --git a/docs/source/api-documentation/common/criterion.rst b/docs/source/api-documentation/common/criterion.rst
new file mode 100644
index 00000000000..64d5c053dc6
--- /dev/null
+++ b/docs/source/api-documentation/common/criterion.rst
@@ -0,0 +1,5 @@
+Criterion
+==============
+
+.. autoapisummary::
+ neural_compressor.experimental.common.criterion
diff --git a/docs/source/api-documentation/common/data.rst b/docs/source/api-documentation/common/data.rst
new file mode 100644
index 00000000000..9f045f42a5c
--- /dev/null
+++ b/docs/source/api-documentation/common/data.rst
@@ -0,0 +1,11 @@
+Data
+###########
+
+The data API information is available:
+
+.. toctree::
+ :maxdepth: 1
+
+ data/datasets.rst
+ data/dataloader.rst
+ data/transforms.rst
\ No newline at end of file
diff --git a/docs/source/api-documentation/common/data/dataloader.rst b/docs/source/api-documentation/common/data/dataloader.rst
new file mode 100644
index 00000000000..38809581d3a
--- /dev/null
+++ b/docs/source/api-documentation/common/data/dataloader.rst
@@ -0,0 +1,19 @@
+Dataloader
+==============
+
+BaseDataLoader
+---------------
+
+.. autoapisummary::
+
+ neural_compressor.experimental.data.dataloaders.base_dataloader
+
+dataloaders
+------------
+.. autoapisummary::
+
+ neural_compressor.experimental.data.dataloaders.default_dataloader
+ neural_compressor.experimental.data.dataloaders.mxnet_dataloader
+ neural_compressor.experimental.data.dataloaders.onnxrt_dataloader
+ neural_compressor.experimental.data.dataloaders.pytorch_dataloader
+ neural_compressor.experimental.data.dataloaders.tensorflow_dataloader
\ No newline at end of file
diff --git a/docs/source/api-documentation/common/data/datasets.rst b/docs/source/api-documentation/common/data/datasets.rst
new file mode 100644
index 00000000000..f86f41c0040
--- /dev/null
+++ b/docs/source/api-documentation/common/data/datasets.rst
@@ -0,0 +1,12 @@
+Datasets
+==============
+
+.. autoapisummary::
+
+ neural_compressor.experimental.data.datasets.bert_dataset
+ neural_compressor.experimental.data.datasets.coco_dataset
+ neural_compressor.experimental.data.datasets.dataset
+ neural_compressor.experimental.data.datasets.dummy_dataset
+ neural_compressor.experimental.data.datasets.imagenet_dataset
+ neural_compressor.experimental.data.datasets.dummy_dataset_v2
+ neural_compressor.experimental.data.datasets.style_transfer_dataset
\ No newline at end of file
diff --git a/docs/source/api-documentation/common/data/transforms.rst b/docs/source/api-documentation/common/data/transforms.rst
new file mode 100644
index 00000000000..d9e63e1fc3e
--- /dev/null
+++ b/docs/source/api-documentation/common/data/transforms.rst
@@ -0,0 +1,6 @@
+Transforms
+==============
+
+.. autoapisummary::
+
+ neural_compressor.experimental.data.transforms.transform
\ No newline at end of file
diff --git a/docs/source/api-documentation/common/metric.rst b/docs/source/api-documentation/common/metric.rst
new file mode 100644
index 00000000000..9349942d733
--- /dev/null
+++ b/docs/source/api-documentation/common/metric.rst
@@ -0,0 +1,6 @@
+Metric
+==============
+
+.. autoapisummary::
+ neural_compressor.experimental.metric.metric
+ neural_compressor.experimental.metric.bleu
\ No newline at end of file
diff --git a/docs/source/api-documentation/common/model.rst b/docs/source/api-documentation/common/model.rst
new file mode 100644
index 00000000000..b632b177c65
--- /dev/null
+++ b/docs/source/api-documentation/common/model.rst
@@ -0,0 +1,6 @@
+Model
+==============
+
+.. autoapisummary::
+
+ neural_compressor.experimental.common.model
diff --git a/docs/source/api-documentation/common/optimizer.rst b/docs/source/api-documentation/common/optimizer.rst
new file mode 100644
index 00000000000..e714f856f7e
--- /dev/null
+++ b/docs/source/api-documentation/common/optimizer.rst
@@ -0,0 +1,5 @@
+Optimizer
+==============
+
+.. autoapisummary::
+ neural_compressor.experimental.common.optimizer
diff --git a/docs/source/api-documentation/component.rst b/docs/source/api-documentation/component.rst
new file mode 100644
index 00000000000..8781252accb
--- /dev/null
+++ b/docs/source/api-documentation/component.rst
@@ -0,0 +1,15 @@
+Component (experimental API, deprecated in 2.0)
+#################################################
+
+The component API information is available:
+
+.. toctree::
+ :maxdepth: 1
+
+ component/Quantization
+ component/Pruning
+ component/Distillation
+ component/Scheduler
+ component/MixedPrecision
+ component/ModelConversion
+ component/Nas
\ No newline at end of file
diff --git a/docs/source/api-documentation/component/Distillation.rst b/docs/source/api-documentation/component/Distillation.rst
new file mode 100644
index 00000000000..7cb9766904a
--- /dev/null
+++ b/docs/source/api-documentation/component/Distillation.rst
@@ -0,0 +1,6 @@
+Distillation
+==============
+
+.. autoapisummary::
+
+ neural_compressor.experimental.distillation
diff --git a/docs/source/api-documentation/component/MixedPrecision.rst b/docs/source/api-documentation/component/MixedPrecision.rst
new file mode 100644
index 00000000000..6152894ac75
--- /dev/null
+++ b/docs/source/api-documentation/component/MixedPrecision.rst
@@ -0,0 +1,6 @@
+MixedPrecision
+================
+
+.. autoapisummary::
+
+ neural_compressor.experimental.mixed_precision
\ No newline at end of file
diff --git a/docs/source/api-documentation/component/ModelConversion.rst b/docs/source/api-documentation/component/ModelConversion.rst
new file mode 100644
index 00000000000..3ce2e1fb891
--- /dev/null
+++ b/docs/source/api-documentation/component/ModelConversion.rst
@@ -0,0 +1,6 @@
+ModelConversion
+================
+
+.. autoapisummary::
+
+ neural_compressor.experimental.model_conversion
diff --git a/docs/source/api-documentation/component/Nas.rst b/docs/source/api-documentation/component/Nas.rst
new file mode 100644
index 00000000000..572f1cf21f6
--- /dev/null
+++ b/docs/source/api-documentation/component/Nas.rst
@@ -0,0 +1,15 @@
+Neural architecture search (NAS)
+=================================
+
+Package Contents
+----------------
+.. autoapisummary::
+
+ neural_compressor.experimental.nas.nas
+
+Classes
+----------------
+.. autoapisummary::
+
+ neural_compressor.experimental.nas.basic_nas
+ neural_compressor.experimental.nas.dynas
\ No newline at end of file
diff --git a/docs/source/api-documentation/component/Pruning.rst b/docs/source/api-documentation/component/Pruning.rst
new file mode 100644
index 00000000000..3bec7485947
--- /dev/null
+++ b/docs/source/api-documentation/component/Pruning.rst
@@ -0,0 +1,7 @@
+Pruning
+==============
+
+.. autoapisummary::
+
+ neural_compressor.experimental.pruning
+ neural_compressor.experimental.pytorch_pruner.pruning
\ No newline at end of file
diff --git a/docs/source/api-documentation/component/Quantization.rst b/docs/source/api-documentation/component/Quantization.rst
new file mode 100644
index 00000000000..afa6fc3cf75
--- /dev/null
+++ b/docs/source/api-documentation/component/Quantization.rst
@@ -0,0 +1,6 @@
+Quantization
+==============
+
+.. autoapisummary::
+
+ neural_compressor.experimental.quantization
diff --git a/docs/source/api-documentation/component/Scheduler.rst b/docs/source/api-documentation/component/Scheduler.rst
new file mode 100644
index 00000000000..44bc31212ce
--- /dev/null
+++ b/docs/source/api-documentation/component/Scheduler.rst
@@ -0,0 +1,6 @@
+Scheduler
+==============
+
+.. autoapisummary::
+
+ neural_compressor.experimental.scheduler
diff --git a/docs/source/api-documentation/pythonic.rst b/docs/source/api-documentation/pythonic.rst
new file mode 100644
index 00000000000..77514870ff7
--- /dev/null
+++ b/docs/source/api-documentation/pythonic.rst
@@ -0,0 +1,8 @@
+Pythonic
+###########
+
+The Pythonic API information is available:
+
+.. toctree::
+ :maxdepth: 1
+
diff --git a/docs/source/api-documentation/strategy.rst b/docs/source/api-documentation/strategy.rst
new file mode 100644
index 00000000000..db96e96607d
--- /dev/null
+++ b/docs/source/api-documentation/strategy.rst
@@ -0,0 +1,9 @@
+Strategy
+###########
+
+The strategy API information is available:
+
+.. toctree::
+ :maxdepth: 1
+
+
diff --git a/docs/api-introduction.md b/docs/source/api-introduction.md
similarity index 100%
rename from docs/api-introduction.md
rename to docs/source/api-introduction.md
diff --git a/docs/backend_quant.md b/docs/source/backend_quant.md
similarity index 100%
rename from docs/backend_quant.md
rename to docs/source/backend_quant.md
diff --git a/docs/bench.md b/docs/source/bench.md
similarity index 79%
rename from docs/bench.md
rename to docs/source/bench.md
index 8938b916357..e0b9802ee7a 100644
--- a/docs/bench.md
+++ b/docs/source/bench.md
@@ -91,113 +91,113 @@ or specify different port that is already opened, for example 8080:
## Home screen
This view shows introduction to Intel® Neural Compressor Bench and a button for creating new project. After clicking this button, pop-up with project wizard will be shown.
-
+
# Create new project
To create a new project, in first step you need to choose its name.
-
+
In second step there are 2 possible options to choose from:
* *predefined model* - you choose model from predefined examples list, you don't need to set any additional parameters,
* *custom model* - in this scenario you can set more parameters and customize your model.
-
+
## Predefined model
First you need to choose domain for the model (image recognition or object detection). For each domain there are few available models to choose from. When you click *Finish* the chosen model will be downloaded.
-
+
## Custom model
First you have to choose the model path. When it is chosen, in most cases all other fields will be completed automatically. You can edit its input and output nodes, see the model graph (if it is available for this model) and set shape for synthetic dataset. If model domain was not detected, you need to choose it from the list. Model domain is used to set some default parameters for the model.
-
+
## Display model graph
-For several model types there will be a button available  in the project wizard. It is also possible to see the graph in Diagnosis tab. The graph by default is collapsed, but when you click on plus icon, sections will be unfolded.
+For several model types there will be a button available  in the project wizard. It is also possible to see the graph in Diagnosis tab. The graph by default is collapsed, but when you click on plus icon, sections will be unfolded.
-.
+.
# Project list
On the left hand side there is a panel with list of created projects. When you click on the project name, you can see its details. "Create new project" button navigates to new project wizard pop-up described in previous section.
-
+
## Remove project
If you want to remove project, you have to click the trash icon next to project name (it is visible when the cursor is on the project name).
-
+
Then you will be prompted to confirm your choice by typing the project name. Project removal is not reversible.
-
+
# Develop the project
## Optimization tab
### Optimization table
In Optimizations tab you can see list of optimizations in the project. Currently UI supports three optimization precisions and two types of optimization.
-
+
### Optimization wizard
To add new optimization, click "Add new optimization" button at the bottom of the table and follow the steps.
-
+
### Editing optimization entries
There is a possibility to modify some optimization parameters even after exit from Wizard.
If optimization has not been run yet, the pencil icon on the right hand side should be in light blue color. That indicates that it can be modified. After click on that pencil icon you can select different precision or dataset.
For Quantization you can also modify Tuning details before optimizing model.
-
+
### Optimization details
To perform optimization click "Run" button. Once process is finished you can click on row with specific optimization to display details about optimization parameters and optimized model. When you click on blue arrow icon in model path line, you can download optimized model.
-
+
## Benchmark tab
### Benchmark table
For each optimization and input model you can add benchmark. Benchmark have 2 modes: accuracy and performance. In benchmark tab you can see all your benchmarks. When you check checkboxes in the last column you can choose benchmark you want to compare in the chart (visible after clicking "Compare selected").
-
+
### Benchmark wizard
To add new benchmark, click "Add new benchmark" button at the bottom of the table and follow the steps.
-
+
### Editing benchmark entries
As for optimizations you can also modify benchmark parameters. You can modify benchmark mode, dataset and benchmark parameters like batch size, number of instances and number of cores per instance.
-
+
### Benchmark details
When the benchmark is added, you can click "Run" button to execute it. Results will be filled in the table and in details view visible after clicking row in the table. You can also see config and output logs when clicking links highlighted in blue.
-
+
## Profiling tab
### Profiling table
It is also possible to do profiling of all Tensorflow frozen models in project.
-
+
### Profiling wizard
To profile model, click "Add new profiling" button at the bottom of the table and follow the steps.
-
+
### Editing profiling entries
In Profiling tab you can edit dataset and number or threads.
-
+
### Profiling details
Once profiling entry is added, you can click "Run" button to execute it. After completing the process, the results will appear in the form of a bar chart and a table with full profiling data. The table is also used to control which operations are included in the chart. Check the box next to the selected row and click "Update chart" button to include it in the bar chart.
Click "Download .csv file" button to get profiling data in .csv file.
-
+
# Diagnosis tab
@@ -205,43 +205,43 @@ Diagnosis tab offers convenient debug information for optimizations with easy wa
To get OP list you need to execute quantization optimization and select optimized model on left hand side. In OP table you can see list of OPs with MSE and min/max activation values. Selecting one of OP in table highlights its position in graph. Configuration for currently selected OP can be set in section under OP table.
-
+
You can set model wise parameters that apply to whole model by clicking button with "Model wise". When you set specific configuration you can view summary and generate new optimization config.
-
+
Model wise configuration provides separate settings for weights and activations.
-
+
## Dataset tab
### Dataset list
Dataset tab presents list of datasets assigned to a project. In most cases the "dummy" dataset consisting of synthetic data should be automatically added while creating a project.
-
+
### Dataset wizard
New dataset can be defined by clicking "Add new profiling" button at the bottom of the table and follow the steps.
-
+
### Dataset details
Dataset details can be inspected by clicking specific row.
-
+
### Custom dataset
When adding the dataset, you can choose *custom* in dataloader and metric field. In that case a template file will be created. The path to the template file will be available in dataset details. You should edit this file to add your custom configuration before using this dataset in optimizations or benchmarks. Small yellow warning will remind about it.
-
+
## Project information
Last tab is called "Project info". You can find here details about the project, when it was created and modified, what is the framework and some details about input model. It is also possible to add some notes about the project.
-
+
## System information
-One can see system information by clicking  button. The result is details dialog:
+One can see system information by clicking  button. The result is details dialog:
-
+
## Security
diff --git a/docs/benchmark.md b/docs/source/benchmark.md
similarity index 100%
rename from docs/benchmark.md
rename to docs/source/benchmark.md
diff --git a/conf.py b/docs/source/conf.py
old mode 100755
new mode 100644
similarity index 85%
rename from conf.py
rename to docs/source/conf.py
index e6316de8938..a721b5d0ba1
--- a/conf.py
+++ b/docs/source/conf.py
@@ -12,12 +12,15 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
+from os import getenv
+import importlib.util
import os
import sys
-sys.path.insert(0, os.path.abspath('.'))
-import importlib.util
+sys.path.insert(0, os.path.abspath('../../'))
moduleName = 'version'
-modulePath = os.getcwd() + '/neural_compressor/version.py'
+# get version.py
+modulePathNeu = os.path.abspath(os.path.join(os.getcwd(), "../.."))
+modulePath = modulePathNeu + '/neural_compressor/version.py'
spec = importlib.util.spec_from_file_location(moduleName,modulePath)
NCversion = importlib.util.module_from_spec(spec)
spec.loader.exec_module(NCversion)
@@ -44,15 +47,20 @@
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
-extensions = ["recommonmark","sphinx_markdown_tables","sphinx_md", "sphinx.ext.autodoc"]
-
+extensions = ['recommonmark', 'sphinx_markdown_tables', 'sphinx.ext.coverage', 'sphinx.ext.autosummary',
+ 'sphinx_md', 'autoapi.extension', 'sphinx.ext.napoleon']
+autoapi_dirs = ['../../neural_compressor']
+autoapi_add_toctree_entry = False
+autosummary_generate = True
+autoapi_options = ['members', 'show-inheritance',
+ 'show-module-summary', 'imported-members', ]
+autoapi_ignore = []
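+# sphinx-autoapi parses the package sources statically, so API pages are generated without importing neural_compressor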
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
-# source_suffix = ['.rst', '.md']
source_suffix = ['.rst', '.md']
# The master toctree document.
@@ -77,13 +85,13 @@
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
+# a list of builtin themes.
#
# html_theme = "asteroid_sphinx_theme"
# html_theme = "classic"
# html_theme = "alabaster"
# html_theme = "sphinx_book_theme"
-html_theme = "sphinx_rtd_theme"
+html_theme = "pytorch_sphinx_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
@@ -96,6 +104,17 @@
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
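+
+# autoapi-skip-member hook: hide classes whose docstring starts with "Not displayed in API Docs."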
+def skip_util_classes(app, what, name, obj, skip, options):
+    if what == "class" and obj.docstring.startswith("Not displayed in API Docs."):
+ skip = True
+ return skip
+
+
+def setup(app):
+ app.add_css_file("custom.css")
+ app.connect("autoapi-skip-member", skip_util_classes)
+
+html_favicon = '_static/imgs/common/intel.svg'
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
@@ -163,13 +182,9 @@
'Miscellaneous'),
]
-def setup(app):
- app.add_css_file("custom.css")
-
-from os import getenv
sphinx_md_useGitHubURL = True
-baseBranch = "master"
+baseBranch = "api-docs"
commitSHA = getenv('GITHUB_SHA')
githubBaseURL = 'https://github.com/' + (getenv('GITHUB_REPOSITORY') or 'intel/neural-compressor') + '/'
githubFileURL = githubBaseURL + "blob/"
diff --git a/docs/contributions.md b/docs/source/contributions.md
similarity index 100%
rename from docs/contributions.md
rename to docs/source/contributions.md
diff --git a/docs/dataloader.md b/docs/source/dataloader.md
similarity index 100%
rename from docs/dataloader.md
rename to docs/source/dataloader.md
diff --git a/docs/dataset.md b/docs/source/dataset.md
similarity index 100%
rename from docs/dataset.md
rename to docs/source/dataset.md
diff --git a/docs/source/design.md b/docs/source/design.md
new file mode 100644
index 00000000000..e75e25df785
--- /dev/null
+++ b/docs/source/design.md
@@ -0,0 +1,15 @@
+Design
+=====
+Intel® Neural Compressor features an architecture and workflow that aids in increasing performance and faster deployments across infrastructures.
+
+## Architecture
+
+
+
+
+
+## Workflow
+
+
+
+
diff --git a/docs/distillation.md b/docs/source/distillation.md
similarity index 95%
rename from docs/distillation.md
rename to docs/source/distillation.md
index b5f363a5a67..49cec901185 100644
--- a/docs/distillation.md
+++ b/docs/source/distillation.md
@@ -1,138 +1,138 @@
-Distillation
-============
-
-1. [Introduction](#introduction)
-
- 1.1. [Knowledge Distillation](#knowledge-distillation)
-
- 1.2. [Intermediate Layer Knowledge Distillation](#intermediate-layer-knowledge-distillation)
-
- 1.3. [Self Distillation](#self-distillation)
-
-2. [Distillation Support Matrix](#distillation-support-matrix)
-3. [Get Started with Distillation API ](#get-started-with-distillation-api)
-4. [Examples](#examples)
-
-## Introduction
-
-Distillation is one of popular approaches of network compression, which transfers knowledge from a large model to a smaller one without loss of validity. As smaller models are less expensive to evaluate, they can be deployed on less powerful hardware (such as a mobile device). Graph shown below is the workflow of the distillation, the teacher model will take the same input that feed into the student model to produce the output that contains knowledge of the teacher model to instruct the student model.
-
-
-
-
-Intel® Neural Compressor supports Knowledge Distillation and Intermediate Layer Knowledge Distillation algorithms.
-
-### Knowledge Distillation
-Knowledge distillation is proposed in [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531). It leverages the logits (the input of softmax in the classification tasks) of teacher and student model to minimize the the difference between their predicted class distributions, this can be done by minimizing the below loss function.
-
-$$L_{KD} = D(z_t, z_s)$$
-
-Where $D$ is a distance measurement, e.g. Euclidean distance and Kullback–Leibler divergence, $z_t$ and $z_s$ are the logits of teacher and student model, or predicted distributions from softmax of the logits in case the distance is measured in terms of distribution.
-
-### Intermediate Layer Knowledge Distillation
-
-There are more information contained in the teacher model beside its logits, for example, the output features of the teacher model's intermediate layers often been used to guide the student model, as in [Patient Knowledge Distillation for BERT Model Compression](https://arxiv.org/pdf/1908.09355) and [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984). The general loss function for this approach can be summarized as follow.
-
-$$L_{KD} = \sum\limits_i D(T_t^{n_i}(F_t^{n_i}), T_s^{m_i}(F_s^{m_i}))$$
-
-Where $D$ is a distance measurement as before, $F_t^{n_i}$ the output feature of the $n_i$'s layer of the teacher model, $F_s^{m_i}$ the output feature of the $m_i$'s layer of the student model. Since the dimensions of $F_t^{n_i}$ and $F_s^{m_i}$ are usually different, the transformations $T_t^{n_i}$ and $T_s^{m_i}$ are needed to match dimensions of the two features. Specifically, the transformation can take the forms like identity, linear transformation, 1X1 convolution etc.
-
-### Self Distillation
-
-Self-distillation ia a one-stage training method where the teacher model and student models can be trained together. It attaches several attention modules and shallow classifiers at different depths of neural networks and distills knowledge from the deepest classifier to the shallower classifiers. Different from the conventional knowledge distillation methods where the knowledge of the teacher model is transferred to another student model, self-distillation can be considered as knowledge transfer in the same model, from the deeper layers to the shallower layers.
-The additional classifiers in self-distillation allow the neural network to work in a dynamic manner, which leads to a much higher acceleration.
-
-
-
-
-Architecture from paper [Self-Distillation: Towards Efficient and Compact Neural Networks](https://ieeexplore.ieee.org/document/9381661)
-
-## Distillation Support Matrix
-
-|Distillation Algorithm |PyTorch |TensorFlow |
-|------------------------------------------------|:--------:|:---------:|
-|Knowledge Distillation |✔ |✔ |
-|Intermediate Layer Knowledge Distillation |✔ |Will be supported|
-|Self Distillation |✔ |✖ |
-
-## Get Started with Distillation API
-
-Simplest launcher code if training behavior is defined in user-defined yaml.
-
-```python
-from neural_compressor.experimental import Distillation, common
-distiller = Distillation('/path/to/user/yaml')
-distiller.student_model = student_model
-distiller.teacher_model = teacher_model
-model = distiller.fit()
-```
-Distillation class also support DistillationConf class as it's argument.
-
-```python
-from neural_compressor.experimental import Distillation, common
-from neural_compressor.conf.config import DistillationConf
-conf = DistillationConf('/path/to/user/yaml')
-distiller = Distillation(conf)
-distiller.student_model = student_model
-distiller.teacher_model = teacher_model
-model = distiller.fit()
-```
-
-User can pass the customized training/evaluation functions to `Distillation` for flexible scenarios. In this case, distillation process can be done by pre-defined hooks in Neural Compressor. User needs to put those hooks inside the training function.
-
-Neural Compressor defines several hooks for user pass
-
-```
-on_train_begin() : Hook executed before training begins
-on_after_compute_loss(input, student_output, student_loss) : Hook executed after each batch inference of student model
-on_epoch_end() : Hook executed at each epoch end
-```
-
-Following section shows how to use hooks in user pass-in training function which is part of example from BlendCNN distillation:
-
-```python
-def train_func(model):
- distiller.on_train_begin()
- for nepoch in range(epochs):
- model.train()
- cnt = 0
- loss_sum = 0.
- iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)')
- for batch in iter_bar:
- teacher_logits, input_ids, segment_ids, input_mask, target = batch
- cnt += 1
- output = model(input_ids, segment_ids, input_mask)
- loss = criterion(output, target)
- loss = distiller.on_after_compute_loss(
- {'input_ids':input_ids, 'segment_ids':segment_ids, 'input_mask':input_mask},
- output,
- loss,
- teacher_logits)
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
- if cnt >= iters:
- break
- print('Average Loss: {}'.format(loss_sum / cnt))
- distiller.on_epoch_end()
-...
-```
-
-In this case, the launcher code is like the following:
-
-```python
-from neural_compressor.experimental import Distillation, common
-from neural_compressor.experimental.common.criterion import PyTorchKnowledgeDistillationLoss
-distiller = Distillation(args.config)
-distiller.student_model = model
-distiller.teacher_model = teacher
-distiller.criterion = PyTorchKnowledgeDistillationLoss()
-distiller.train_func = train_func
-model = distiller.fit()
-```
-
-## Examples
-
-[Distillation Examples](../examples/README.md#distillation)
-
-[Distillation Examples Results](./validated_model_list.md#validated-knowledge-distillation-examples)
+Distillation
+============
+
+1. [Introduction](#introduction)
+
+ 1.1. [Knowledge Distillation](#knowledge-distillation)
+
+ 1.2. [Intermediate Layer Knowledge Distillation](#intermediate-layer-knowledge-distillation)
+
+ 1.3. [Self Distillation](#self-distillation)
+
+2. [Distillation Support Matrix](#distillation-support-matrix)
+3. [Get Started with Distillation API](#get-started-with-distillation-api)
+4. [Examples](#examples)
+
+## Introduction
+
+Distillation is one of the most popular approaches to network compression. It transfers knowledge from a large model to a smaller one without significant loss of validity. As smaller models are less expensive to evaluate, they can be deployed on less powerful hardware (such as a mobile device). The graph shown below is the workflow of distillation: the teacher model takes the same input that is fed into the student model and produces an output containing knowledge of the teacher model to guide the student model.
+
+
+
+
+Intel® Neural Compressor supports Knowledge Distillation and Intermediate Layer Knowledge Distillation algorithms.
+
+### Knowledge Distillation
+Knowledge distillation is proposed in [Distilling the Knowledge in a Neural Network](https://arxiv.org/abs/1503.02531). It leverages the logits (the input of softmax in classification tasks) of the teacher and student models to minimize the difference between their predicted class distributions. This can be done by minimizing the loss function below.
+
+$$L_{KD} = D(z_t, z_s)$$
+
+Where $D$ is a distance measurement, e.g. Euclidean distance or Kullback–Leibler divergence, and $z_t$ and $z_s$ are the logits of the teacher and student models, or their predicted distributions from softmax of the logits in case the distance is measured in terms of distributions.
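+
+For illustration only (not the Neural Compressor implementation), a minimal PyTorch-style sketch of this loss, assuming a hypothetical temperature `T` and KL divergence as the distance $D$, could look like the following:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def kd_loss(student_logits, teacher_logits, T=2.0):
+    """Sketch of a knowledge distillation loss: KL divergence between softened distributions."""
+    p_teacher = F.softmax(teacher_logits / T, dim=-1)           # teacher's softened distribution
+    log_p_student = F.log_softmax(student_logits / T, dim=-1)   # student's softened log-probabilities
+    # 'batchmean' matches the mathematical definition of KL divergence; T*T is the usual scaling
+    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * (T * T)
+```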
+
+### Intermediate Layer Knowledge Distillation
+
+There is more information contained in the teacher model besides its logits; for example, the output features of the teacher model's intermediate layers are often used to guide the student model, as in [Patient Knowledge Distillation for BERT Model Compression](https://arxiv.org/pdf/1908.09355) and [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984). The general loss function for this approach can be summarized as follows.
+
+$$L_{KD} = \sum\limits_i D(T_t^{n_i}(F_t^{n_i}), T_s^{m_i}(F_s^{m_i}))$$
+
+Where $D$ is a distance measurement as before, $F_t^{n_i}$ is the output feature of the $n_i$-th layer of the teacher model, and $F_s^{m_i}$ is the output feature of the $m_i$-th layer of the student model. Since the dimensions of $F_t^{n_i}$ and $F_s^{m_i}$ are usually different, the transformations $T_t^{n_i}$ and $T_s^{m_i}$ are needed to match the dimensions of the two features. Specifically, the transformation can take forms such as identity, linear transformation, 1x1 convolution, etc.
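+
+For illustration only, a hedged PyTorch-style sketch of one term of this loss, assuming hypothetical feature sizes and a learnable linear transformation on the student side, might be:
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+teacher_dim, student_dim = 768, 312          # assumed hidden sizes, e.g. a large teacher vs. a small student
+proj = nn.Linear(student_dim, teacher_dim)   # T_s: match the student feature dimension to the teacher's
+
+def intermediate_kd_loss(teacher_feat, student_feat):
+    """MSE between the teacher feature and the projected student feature (one term of the sum)."""
+    return F.mse_loss(proj(student_feat), teacher_feat)
+```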
+
+### Self Distillation
+
+Self-distillation is a one-stage training method where the teacher model and student models can be trained together. It attaches several attention modules and shallow classifiers at different depths of the neural network and distills knowledge from the deepest classifier to the shallower classifiers. Different from conventional knowledge distillation methods, where the knowledge of the teacher model is transferred to another student model, self-distillation can be considered as knowledge transfer within the same model, from the deeper layers to the shallower layers.
+The additional classifiers in self-distillation allow the neural network to work in a dynamic manner, which leads to a much higher acceleration.
+
+
+
+
+Architecture from paper [Self-Distillation: Towards Efficient and Compact Neural Networks](https://ieeexplore.ieee.org/document/9381661)
+
+## Distillation Support Matrix
+
+|Distillation Algorithm |PyTorch |TensorFlow |
+|------------------------------------------------|:--------:|:---------:|
+|Knowledge Distillation |✔ |✔ |
+|Intermediate Layer Knowledge Distillation |✔ |Will be supported|
+|Self Distillation |✔ |✖ |
+
+## Get Started with Distillation API
+
+The simplest launcher code, assuming the training behavior is defined in a user-provided YAML file, is shown below.
+
+```python
+from neural_compressor.experimental import Distillation, common
+distiller = Distillation('/path/to/user/yaml')
+distiller.student_model = student_model
+distiller.teacher_model = teacher_model
+model = distiller.fit()
+```
+The Distillation class also supports a DistillationConf object as its argument.
+
+```python
+from neural_compressor.experimental import Distillation, common
+from neural_compressor.conf.config import DistillationConf
+conf = DistillationConf('/path/to/user/yaml')
+distiller = Distillation(conf)
+distiller.student_model = student_model
+distiller.teacher_model = teacher_model
+model = distiller.fit()
+```
+
+Users can pass customized training/evaluation functions to `Distillation` for flexible scenarios. In this case, the distillation process is driven by pre-defined hooks in Neural Compressor, and users need to invoke those hooks inside the training function.
+
+Neural Compressor defines several hooks for users to use:
+
+```
+on_train_begin() : Hook executed before training begins
+on_after_compute_loss(input, student_output, student_loss) : Hook executed after each batch inference of student model
+on_epoch_end() : Hook executed at each epoch end
+```
+
+The following section shows how to use these hooks in a user-defined training function, taken in part from the BlendCNN distillation example:
+
+```python
+def train_func(model):
+ distiller.on_train_begin()
+ for nepoch in range(epochs):
+ model.train()
+ cnt = 0
+ loss_sum = 0.
+ iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)')
+ for batch in iter_bar:
+ teacher_logits, input_ids, segment_ids, input_mask, target = batch
+ cnt += 1
+ output = model(input_ids, segment_ids, input_mask)
+ loss = criterion(output, target)
+ loss = distiller.on_after_compute_loss(
+ {'input_ids':input_ids, 'segment_ids':segment_ids, 'input_mask':input_mask},
+ output,
+ loss,
+ teacher_logits)
+            loss_sum += loss.item()  # accumulate for the average loss reported at the end of the epoch
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+ if cnt >= iters:
+ break
+ print('Average Loss: {}'.format(loss_sum / cnt))
+ distiller.on_epoch_end()
+...
+```
+
+In this case, the launcher code looks like the following:
+
+```python
+from neural_compressor.experimental import Distillation, common
+from neural_compressor.experimental.common.criterion import PyTorchKnowledgeDistillationLoss
+distiller = Distillation(args.config)
+distiller.student_model = model
+distiller.teacher_model = teacher
+distiller.criterion = PyTorchKnowledgeDistillationLoss()
+distiller.train_func = train_func
+model = distiller.fit()
+```
+
+## Examples
+
+[Distillation Examples](../examples/README.md#distillation)
+
+[Distillation Examples Results](./validated_model_list.md#validated-knowledge-distillation-examples)
diff --git a/docs/distillation_quantization.md b/docs/source/distillation_quantization.md
similarity index 100%
rename from docs/distillation_quantization.md
rename to docs/source/distillation_quantization.md
diff --git a/docs/distributed.md b/docs/source/distributed.md
similarity index 100%
rename from docs/distributed.md
rename to docs/source/distributed.md
diff --git a/docs/doclist.rst b/docs/source/doclist.rst
similarity index 100%
rename from docs/doclist.rst
rename to docs/source/doclist.rst
diff --git a/docs/dynamic_quantization.md b/docs/source/dynamic_quantization.md
similarity index 100%
rename from docs/dynamic_quantization.md
rename to docs/source/dynamic_quantization.md
diff --git a/docs/examples_readme.md b/docs/source/examples_readme.md
similarity index 100%
rename from docs/examples_readme.md
rename to docs/source/examples_readme.md
diff --git a/docs/faq.md b/docs/source/faq.md
similarity index 100%
rename from docs/faq.md
rename to docs/source/faq.md
diff --git a/docs/framework_yaml.md b/docs/source/framework_yaml.md
similarity index 97%
rename from docs/framework_yaml.md
rename to docs/source/framework_yaml.md
index cb4b72da20a..7e8f6136a4d 100644
--- a/docs/framework_yaml.md
+++ b/docs/source/framework_yaml.md
@@ -1,194 +1,194 @@
-Framework YAML Configuration Files
-====
-1. [Introduction](#introduction)
-2. [Supported Feature Matrix](#supported-feature-matrix)
-2. [Get Started with Framework YAML Files](#get-started-with-framework-yaml-files)
-
-
-
-## Introduction
-
-Intel® Neural Compressor uses YAML files for quick
-and user-friendly configurations. There are two types of YAML files -
-user YAML files and framework YAML files, which are used in
-running user cases and setting up framework capabilities, respectively.
-
-Here, we introduce the framework YAML file, which describes the behavior of
-a specific framework. There is a corresponding framework YAML file for each framework supported by
-Intel® Neural Compressor - TensorFlow
-, Intel® Extension for TensorFlow*, PyTorch, Intel® Extension for PyTorch*, ONNX Runtime, and MXNet.
-
->**Note**: Before diving to the details, we recommend that the end users do NOT make modifications
-unless they have clear requirements that can only be met by modifying the attributes.
-
-## Supported Feature Matrix
-
-| Framework | YAML Configuration Files |
-|------------|:------------------------:|
-| TensorFlow | ✔ |
-| PyTorch | ✔ |
-| ONNX | ✔ |
-| MXNet | ✔ |
-
-
-## Get started with Framework YAML Files
-
-For the purpose of framework setup, let's take a look at a tensorflow framework YAML file;
-other framework YAML files follow same syntax. A framework YAML file specifies following
-information and capabilities for current runtime framework. Let's go through
-them one by one:
-
-* ***version***: This specifies the supported versions.
-```yaml
- version:
- name: ['2.1.0', '2.2.0', '2.3.0', '2.4.0', '2.5.0', '2.6.0', '2.6.1', '2.6.2', '2.7.0', '2.8.0', '1.15.0-up1', '1.15.0-up2']
-```
-
-* ***precisions***: This defines the supported precisions of specific versions.
-```yaml
- precisions:
- names: int8, uint8, bf16, fp32
- valid_mixed_precisions: []
-```
-* ***op***: This defines a list of valid OP types for each precision.
-```yaml
- ops:
- int8: ['Conv2D', 'MatMul', 'ConcatV2', 'MaxPool', 'AvgPool']
- uint8: ['Conv2D', 'DepthwiseConv2dNative', 'MatMul', 'ConcatV2', 'MaxPool', 'AvgPool']
- bf16: ['Conv2D']
- fp32: ['*'] # '*' means all op types
-```
-* ***capabilities***: This defines the quantization ability of specific ops, such as
-granularity, scheme, and algorithm. The activation assumes that input and output activations
-share the same data type by default, which is based on op semantics defined by
-frameworks.
-```yaml
- capabilities:
- int8: {
- 'Conv2D': {
- 'weight': {
- 'dtype': ['int8', 'fp32'],
- 'scheme': ['sym'],
- 'granularity': ['per_channel','per_tensor'],
- 'algorithm': ['minmax']
- },
- 'activation': {
- 'dtype': ['int8', 'fp32'],
- 'scheme': ['sym'],
- 'granularity': ['per_tensor'],
- 'algorithm': ['minmax', 'kl']
- }
- },
- 'MatMul': {
- 'weight': {
- 'dtype': ['int8', 'fp32'],
- 'scheme': ['sym'],
- 'granularity': ['per_tensor'],
- 'algorithm': ['minmax']
- },
- 'activation': {
- 'dtype': ['int8', 'fp32'],
- 'scheme': ['asym', 'sym'],
- 'granularity': ['per_tensor'],
- 'algorithm': ['minmax']
- }
- },
- 'default': {
- 'activation': {
- 'dtype': ['uint8', 'fp32'],
- 'algorithm': ['minmax'],
- 'scheme': ['sym'],
- 'granularity': ['per_tensor']
- }
- },
- }
-
- uint8: {
- 'Conv2D': {
- 'weight': {
- 'dtype': ['int8', 'fp32'],
- 'scheme': ['sym'],
- 'granularity': ['per_channel','per_tensor'],
- 'algorithm': ['minmax']
- },
- 'activation': {
- 'dtype': ['uint8', 'fp32'],
- 'scheme': ['sym'],
- 'granularity': ['per_tensor'],
- 'algorithm': ['minmax', 'kl']
- }
- },
- 'MatMul': {
- 'weight': {
- 'dtype': ['int8', 'fp32'],
- 'scheme': ['sym'],
- 'granularity': ['per_tensor'],
- 'algorithm': ['minmax']
- },
- 'activation': {
- 'dtype': ['uint8', 'fp32'],
- 'scheme': ['asym', 'sym'],
- 'granularity': ['per_tensor'],
- 'algorithm': ['minmax']
- }
- },
- 'default': {
- 'activation': {
- 'dtype': ['uint8', 'fp32'],
- 'algorithm': ['minmax'],
- 'scheme': ['sym'],
- 'granularity': ['per_tensor']
- }
- },
- }
-```
-* ***patterns***: This defines the supported fusion sequence for each op.
-```yaml
- patterns:
- fp32: [
- 'Conv2D + Add + Relu',
- 'Conv2D + Add + Relu6',
- 'Conv2D + Relu',
- 'Conv2D + Relu6',
- 'Conv2D + BiasAdd'
- ]
- int8: [
- 'Conv2D + BiasAdd',
- 'Conv2D + BiasAdd + Relu',
- 'Conv2D + BiasAdd + Relu6'
- ]
- uint8: [
- 'Conv2D + BiasAdd + AddN + Relu',
- 'Conv2D + BiasAdd + AddN + Relu6',
- 'Conv2D + BiasAdd + AddV2 + Relu',
- 'Conv2D + BiasAdd + AddV2 + Relu6',
- 'Conv2D + BiasAdd + Add + Relu',
- 'Conv2D + BiasAdd + Add + Relu6',
- 'Conv2D + BiasAdd + Relu',
- 'Conv2D + BiasAdd + Relu6',
- 'Conv2D + Add + Relu',
- 'Conv2D + Add + Relu6',
- 'Conv2D + Relu',
- 'Conv2D + Relu6',
- 'Conv2D + BiasAdd',
- 'DepthwiseConv2dNative + BiasAdd + Relu6',
- 'DepthwiseConv2dNative + BiasAdd + Relu',
- 'DepthwiseConv2dNative + Add + Relu6',
- 'DepthwiseConv2dNative + BiasAdd',
- 'MatMul + BiasAdd + Relu',
- 'MatMul + BiasAdd',
- ]
-```
-
-* ***grappler_optimization***: This defines the grappler optimization.
-```yaml
- grappler_optimization:
- pruning: True # optional. grappler pruning optimizer,default value is True.
- shape: True # optional. grappler shape optimizer,default value is True.
- constfold: False # optional. grappler constant folding optimizer, default value is True.
- arithmetic: False # optional. grappler arithmetic optimizer,default value is False.
- dependency: True # optional. grappler dependency optimizer,default value is True.
- debug_stripper: True # optional. grappler debug_stripper optimizer,default value is True.
- loop: True # optional. grappler loop optimizer,default value is True.
-
-```
+Framework YAML Configuration Files
+====
+1. [Introduction](#introduction)
+2. [Supported Feature Matrix](#supported-feature-matrix)
+3. [Get Started with Framework YAML Files](#get-started-with-framework-yaml-files)
+
+
+
+## Introduction
+
+Intel® Neural Compressor uses YAML files for quick
+and user-friendly configurations. There are two types of YAML files -
+user YAML files and framework YAML files, which are used in
+running user cases and setting up framework capabilities, respectively.
+
+Here, we introduce the framework YAML file, which describes the behavior of
+a specific framework. There is a corresponding framework YAML file for each framework supported by
+Intel® Neural Compressor - TensorFlow
+, Intel® Extension for TensorFlow*, PyTorch, Intel® Extension for PyTorch*, ONNX Runtime, and MXNet.
+
+>**Note**: Before diving into the details, we recommend that end users do NOT make modifications
+unless they have clear requirements that can only be met by modifying the attributes.
+
+## Supported Feature Matrix
+
+| Framework | YAML Configuration Files |
+|------------|:------------------------:|
+| TensorFlow | ✔ |
+| PyTorch | ✔ |
+| ONNX | ✔ |
+| MXNet | ✔ |
+
+
+## Get Started with Framework YAML Files
+
+For the purpose of framework setup, let's take a look at a TensorFlow framework YAML file;
+other framework YAML files follow the same syntax. A framework YAML file specifies the following
+information and capabilities for the current runtime framework. Let's go through
+them one by one:
+
+* ***version***: This specifies the supported versions.
+```yaml
+ version:
+ name: ['2.1.0', '2.2.0', '2.3.0', '2.4.0', '2.5.0', '2.6.0', '2.6.1', '2.6.2', '2.7.0', '2.8.0', '1.15.0-up1', '1.15.0-up2']
+```
+
+* ***precisions***: This defines the supported precisions of specific versions.
+```yaml
+ precisions:
+ names: int8, uint8, bf16, fp32
+ valid_mixed_precisions: []
+```
+* ***ops***: This defines a list of valid OP types for each precision.
+```yaml
+ ops:
+ int8: ['Conv2D', 'MatMul', 'ConcatV2', 'MaxPool', 'AvgPool']
+ uint8: ['Conv2D', 'DepthwiseConv2dNative', 'MatMul', 'ConcatV2', 'MaxPool', 'AvgPool']
+ bf16: ['Conv2D']
+ fp32: ['*'] # '*' means all op types
+```
+* ***capabilities***: This defines the quantization capability of specific ops, such as
+granularity, scheme, and algorithm. By default, the activation entry assumes that input and
+output activations share the same data type, which is based on the op semantics defined by
+frameworks.
+```yaml
+ capabilities:
+ int8: {
+ 'Conv2D': {
+ 'weight': {
+ 'dtype': ['int8', 'fp32'],
+ 'scheme': ['sym'],
+ 'granularity': ['per_channel','per_tensor'],
+ 'algorithm': ['minmax']
+ },
+ 'activation': {
+ 'dtype': ['int8', 'fp32'],
+ 'scheme': ['sym'],
+ 'granularity': ['per_tensor'],
+ 'algorithm': ['minmax', 'kl']
+ }
+ },
+ 'MatMul': {
+ 'weight': {
+ 'dtype': ['int8', 'fp32'],
+ 'scheme': ['sym'],
+ 'granularity': ['per_tensor'],
+ 'algorithm': ['minmax']
+ },
+ 'activation': {
+ 'dtype': ['int8', 'fp32'],
+ 'scheme': ['asym', 'sym'],
+ 'granularity': ['per_tensor'],
+ 'algorithm': ['minmax']
+ }
+ },
+ 'default': {
+ 'activation': {
+ 'dtype': ['uint8', 'fp32'],
+ 'algorithm': ['minmax'],
+ 'scheme': ['sym'],
+ 'granularity': ['per_tensor']
+ }
+ },
+ }
+
+ uint8: {
+ 'Conv2D': {
+ 'weight': {
+ 'dtype': ['int8', 'fp32'],
+ 'scheme': ['sym'],
+ 'granularity': ['per_channel','per_tensor'],
+ 'algorithm': ['minmax']
+ },
+ 'activation': {
+ 'dtype': ['uint8', 'fp32'],
+ 'scheme': ['sym'],
+ 'granularity': ['per_tensor'],
+ 'algorithm': ['minmax', 'kl']
+ }
+ },
+ 'MatMul': {
+ 'weight': {
+ 'dtype': ['int8', 'fp32'],
+ 'scheme': ['sym'],
+ 'granularity': ['per_tensor'],
+ 'algorithm': ['minmax']
+ },
+ 'activation': {
+ 'dtype': ['uint8', 'fp32'],
+ 'scheme': ['asym', 'sym'],
+ 'granularity': ['per_tensor'],
+ 'algorithm': ['minmax']
+ }
+ },
+ 'default': {
+ 'activation': {
+ 'dtype': ['uint8', 'fp32'],
+ 'algorithm': ['minmax'],
+ 'scheme': ['sym'],
+ 'granularity': ['per_tensor']
+ }
+ },
+ }
+```
+* ***patterns***: This defines the supported fusion patterns for each precision.
+```yaml
+ patterns:
+ fp32: [
+ 'Conv2D + Add + Relu',
+ 'Conv2D + Add + Relu6',
+ 'Conv2D + Relu',
+ 'Conv2D + Relu6',
+ 'Conv2D + BiasAdd'
+ ]
+ int8: [
+ 'Conv2D + BiasAdd',
+ 'Conv2D + BiasAdd + Relu',
+ 'Conv2D + BiasAdd + Relu6'
+ ]
+ uint8: [
+ 'Conv2D + BiasAdd + AddN + Relu',
+ 'Conv2D + BiasAdd + AddN + Relu6',
+ 'Conv2D + BiasAdd + AddV2 + Relu',
+ 'Conv2D + BiasAdd + AddV2 + Relu6',
+ 'Conv2D + BiasAdd + Add + Relu',
+ 'Conv2D + BiasAdd + Add + Relu6',
+ 'Conv2D + BiasAdd + Relu',
+ 'Conv2D + BiasAdd + Relu6',
+ 'Conv2D + Add + Relu',
+ 'Conv2D + Add + Relu6',
+ 'Conv2D + Relu',
+ 'Conv2D + Relu6',
+ 'Conv2D + BiasAdd',
+ 'DepthwiseConv2dNative + BiasAdd + Relu6',
+ 'DepthwiseConv2dNative + BiasAdd + Relu',
+ 'DepthwiseConv2dNative + Add + Relu6',
+ 'DepthwiseConv2dNative + BiasAdd',
+ 'MatMul + BiasAdd + Relu',
+ 'MatMul + BiasAdd',
+ ]
+```
+
+* ***grappler_optimization***: This defines the TensorFlow Grappler optimization options.
+```yaml
+ grappler_optimization:
+    pruning: True            # optional. grappler pruning optimizer, default value is True.
+    shape: True              # optional. grappler shape optimizer, default value is True.
+    constfold: False         # optional. grappler constant folding optimizer, default value is True.
+    arithmetic: False        # optional. grappler arithmetic optimizer, default value is False.
+    dependency: True         # optional. grappler dependency optimizer, default value is True.
+    debug_stripper: True     # optional. grappler debug_stripper optimizer, default value is True.
+    loop: True               # optional. grappler loop optimizer, default value is True.
+
+```
diff --git a/docs/getting_started.md b/docs/source/getting_started.md
similarity index 100%
rename from docs/getting_started.md
rename to docs/source/getting_started.md
diff --git a/docs/graph_optimization.md b/docs/source/graph_optimization.md
similarity index 100%
rename from docs/graph_optimization.md
rename to docs/source/graph_optimization.md
diff --git a/docs/incompatible_changes.md b/docs/source/incompatible_changes.md
similarity index 100%
rename from docs/incompatible_changes.md
rename to docs/source/incompatible_changes.md
diff --git a/index.rst b/docs/source/index.rst
old mode 100755
new mode 100644
similarity index 62%
rename from index.rst
rename to docs/source/index.rst
index 3b323adc4c8..2ab2e1d8bbb
--- a/index.rst
+++ b/docs/source/index.rst
@@ -11,14 +11,15 @@ Sections
:maxdepth: 1
README.md
- docs/tutorial.md
- docs/examples_readme.md
+ tutorial.md
+ examples_readme.md
api-documentation/apis.rst
- docs/doclist.rst
- docs/releases_info.md
- docs/contributions.md
- docs/legal_information.md
- docs/security_policy.md
+ doclist.rst
+ releases_info.md
+ contributions.md
+ legal_information.md
+ security_policy.md
+
Intel® Neural Compressor repository
diff --git a/docs/infrastructure.md b/docs/source/infrastructure.md
similarity index 98%
rename from docs/infrastructure.md
rename to docs/source/infrastructure.md
index f09ec322d6a..c2c2aae62e4 100644
--- a/docs/infrastructure.md
+++ b/docs/source/infrastructure.md
@@ -11,8 +11,8 @@ Neural Coder automatically inserts quantization code snippets on a PyTorch model
## Architecture
-
-
+
+
Intel® Neural Compressor has unified interfaces which dispatch tasks to different frameworks via adaptor layer. The adaptor layer is the bridge between the tuning strategy and vanilla framework quantization APIs. Users can select tuning strategies and the strategy module contains model configs and tuning configs. Model configs define the quantization approach, if it's post-training static quantization, users need to set more parameters like calibration and so on. There are several tuning strategies for users to choose from while the basic strategy is set as default.
diff --git a/docs/installation_guide.md b/docs/source/installation_guide.md
similarity index 100%
rename from docs/installation_guide.md
rename to docs/source/installation_guide.md
diff --git a/docs/legal_information.md b/docs/source/legal_information.md
similarity index 100%
rename from docs/legal_information.md
rename to docs/source/legal_information.md
diff --git a/docs/metric.md b/docs/source/metric.md
similarity index 100%
rename from docs/metric.md
rename to docs/source/metric.md
diff --git a/docs/mixed_precision.md b/docs/source/mixed_precision.md
similarity index 95%
rename from docs/mixed_precision.md
rename to docs/source/mixed_precision.md
index 04b155bb8f1..4a0ff3830fe 100644
--- a/docs/mixed_precision.md
+++ b/docs/source/mixed_precision.md
@@ -12,9 +12,9 @@ The recent growth of Deep Learning has driven the development of more complex mo
The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Cooper Lake), featuring Intel® Deep Learning Boost, is the first general-purpose x86 CPU to support the bfloat16 format. Specifically, three new bfloat16 instructions are added as a part of the AVX512_BF16 extension within Intel Deep Learning Boost: VCVTNE2PS2BF16, VCVTNEPS2BF16, and VDPBF16PS. The first two instructions allow converting to and from bfloat16 data type, while the last one performs a dot product of bfloat16 pairs. Further details can be found in the [hardware numerics document](https://software.intel.com/content/www/us/en/develop/download/bfloat16-hardware-numerics-definition.html) published by Intel.
-
+
diff --git a/docs/model.md b/docs/source/model.md
similarity index 95%
rename from docs/model.md
rename to docs/source/model.md
index 3bb4f0e9a52..b0ca55236f0 100644
--- a/docs/model.md
+++ b/docs/source/model.md
@@ -11,9 +11,9 @@ Model
## Introduction
The Neural Compressor Model feature is used to encapsulate the behavior of model building and saving. By simply providing information such as different model formats and framework_specific_info, Neural Compressor performs optimizations and quantization on this model object and returns a Neural Compressor Model object for further model persistence or benchmarking. A Neural Compressor Model helps users to maintain necessary model information which is required during optimization and quantization such as the input/output names, workspace path, and other model format knowledge. This helps unify the features gap brought by different model formats and frameworks.
-
+
diff --git a/docs/model_conversion.md b/docs/source/model_conversion.md
similarity index 100%
rename from docs/model_conversion.md
rename to docs/source/model_conversion.md
diff --git a/docs/objective.md b/docs/source/objective.md
similarity index 100%
rename from docs/objective.md
rename to docs/source/objective.md
diff --git a/docs/orchestration.md b/docs/source/orchestration.md
old mode 100755
new mode 100644
similarity index 97%
rename from docs/orchestration.md
rename to docs/source/orchestration.md
index 9de1d46172b..fb7e4fa4099
--- a/docs/orchestration.md
+++ b/docs/source/orchestration.md
@@ -1,112 +1,112 @@
-Optimization Orchestration
-============
-
-1. [Introduction](#introduction)
-
- 1.1. [One-shot](#one-shot)
-
- 1.2. [Multi-shot](#multi-shot)
-
-2. [Orchestration Support Matrix](#orchestration-support-matrix)
-3. [Get Started with Orchestration API ](#get-started-with-orchestration-api)
-4. [Examples](#examples)
-
-## Introduction
-
-Orchestration is the combination of multiple optimization techniques, either applied simultaneously (one-shot) or sequentially (multi-shot). Intel Neural Compressor supports arbitrary meaningful combinations of supported optimization methods under one-shot or multi-shot, such as pruning during quantization-aware training, or pruning and then post-training quantization, pruning and then distillation and then quantization.
-
-### One-shot
-Since quantization-aware training, pruning and distillation all leverage training process for optimization, we can achieve the goal of optimization through one shot training with arbitrary meaningful combinations of these methods, which often gain more benefits in terms of performance and accuracy than just one compression technique applied, and usually are as efficient as applying just one compression technique. The three possible combinations are shown below.
-- Pruning during quantization-aware training
-- Distillation with pattern lock pruning
-- Distillation with pattern lock pruning and quantization-aware training
-
-### Multi-shot
-Of course, besides one-shot, we also support separate execution of each optimization process.
-- Pruning and then post-training quantization
-- Distillation and then post-training quantization
-- Distillation, then pruning and post-training quantization
-
-## Orchestration Support Matrix
-
-
-
-
Orchestration
-
Combinations
-
Supported
-
-
-
-
-
One-shot
-
Pruning + Quantization Aware Training
-
✔
-
-
-
Distillation + Quantization Aware Training
-
✔
-
-
-
Distillation + Pruning
-
✔
-
-
-
Distillation + Pruning + Quantization Aware Training
-
✔
-
-
-
Multi-shot
-
Pruning then Quantization
-
✔
-
-
-
Distillation then Quantization
-
✔
-
-
-
Distillation then Pruning
-
✔
-
-
-
Distillation then Pruning then Quantization
-
✔
-
-
-
-
-## Get Started with Orchestration API
-
-Neural Compressor defines `Scheduler` class to automatically pipeline execute model optimization with one shot or multiple shots way.
-
-User instantiates model optimization components, such as quantization, pruning, distillation, separately. After that, user could append
-those separate optimization objects into scheduler's pipeline, the scheduler API executes them one by one.
-
-In following example it executes the pruning and then post-training quantization with two-shot way.
-
-```python
-from neural_compressor.experimental import Quantization, Pruning, Scheduler
-prune = Pruning(prune_conf)
-quantizer = Quantization(post_training_quantization_conf)
-scheduler = Scheduler()
-scheduler.model = model
-scheduler.append(prune)
-scheduler.append(quantizer)
-opt_model = scheduler.fit()
-```
-
-If user wants to execute the pruning and quantization-aware training with one-shot way, the code is like below.
-
-```python
-from neural_compressor.experimental import Quantization, Pruning, Scheduler
-prune = Pruning(prune_conf)
-quantizer = Quantization(quantization_aware_training_conf)
-scheduler = Scheduler()
-scheduler.model = model
-combination = scheduler.combine(prune, quantizer)
-scheduler.append(combination)
-opt_model = scheduler.fit()
-```
-
-## Examples
-
-[Orchestration Examples](../examples/README.md#orchestration)
+Optimization Orchestration
+============
+
+1. [Introduction](#introduction)
+
+ 1.1. [One-shot](#one-shot)
+
+ 1.2. [Multi-shot](#multi-shot)
+
+2. [Orchestration Support Matrix](#orchestration-support-matrix)
+3. [Get Started with Orchestration API](#get-started-with-orchestration-api)
+4. [Examples](#examples)
+
+## Introduction
+
+Orchestration is the combination of multiple optimization techniques, either applied simultaneously (one-shot) or sequentially (multi-shot). Intel Neural Compressor supports arbitrary meaningful combinations of the supported optimization methods under one-shot or multi-shot, such as pruning during quantization-aware training, pruning and then post-training quantization, or pruning, then distillation, and then quantization.
+
+### One-shot
+Since quantization-aware training, pruning, and distillation all leverage the training process for optimization, we can achieve the goal of optimization through one-shot training with arbitrary meaningful combinations of these methods. This often gains more benefit in terms of performance and accuracy than applying just one compression technique, and is usually about as efficient as applying a single technique. The three possible combinations are shown below.
+- Pruning during quantization-aware training
+- Distillation with pattern lock pruning
+- Distillation with pattern lock pruning and quantization-aware training
+
+### Multi-shot
+Besides one-shot, Intel Neural Compressor also supports executing each optimization process separately (multi-shot).
+- Pruning and then post-training quantization
+- Distillation and then post-training quantization
+- Distillation, then pruning and post-training quantization
+
+## Orchestration Support Matrix
+
+|Orchestration |Combinations                                         |Supported |
+|--------------|-----------------------------------------------------|:--------:|
+|One-shot      |Pruning + Quantization Aware Training                |✔         |
+|One-shot      |Distillation + Quantization Aware Training           |✔         |
+|One-shot      |Distillation + Pruning                               |✔         |
+|One-shot      |Distillation + Pruning + Quantization Aware Training |✔         |
+|Multi-shot    |Pruning then Quantization                            |✔         |
+|Multi-shot    |Distillation then Quantization                       |✔         |
+|Multi-shot    |Distillation then Pruning                            |✔         |
+|Multi-shot    |Distillation then Pruning then Quantization          |✔         |
+
+## Get Started with Orchestration API
+
+Neural Compressor defines a `Scheduler` class to automatically pipeline the execution of model optimizations in either a one-shot or multi-shot way.
+
+Users instantiate model optimization components, such as quantization, pruning, and distillation, separately. After that, users can append
+those separate optimization objects into the scheduler's pipeline, and the scheduler API executes them one by one.
+
+The following example executes pruning and then post-training quantization in a two-shot (multi-shot) way.
+
+```python
+from neural_compressor.experimental import Quantization, Pruning, Scheduler
+prune = Pruning(prune_conf)
+quantizer = Quantization(post_training_quantization_conf)
+scheduler = Scheduler()
+scheduler.model = model
+scheduler.append(prune)
+scheduler.append(quantizer)
+opt_model = scheduler.fit()
+```
+
+If users want to execute pruning and quantization-aware training in a one-shot way, the code looks like the following.
+
+```python
+from neural_compressor.experimental import Quantization, Pruning, Scheduler
+prune = Pruning(prune_conf)
+quantizer = Quantization(quantization_aware_training_conf)
+scheduler = Scheduler()
+scheduler.model = model
+combination = scheduler.combine(prune, quantizer)
+scheduler.append(combination)
+opt_model = scheduler.fit()
+```
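+
+The support matrix above also lists three-way one-shot combinations such as distillation, pruning, and quantization-aware training. As a hedged sketch (assuming `scheduler.combine` accepts any number of components, and that `distillation_conf`, `prune_conf`, and `quantization_aware_training_conf` are user-provided configurations), such a pipeline could be assembled like this:
+
+```python
+from neural_compressor.experimental import Quantization, Pruning, Distillation, Scheduler
+distiller = Distillation(distillation_conf)
+prune = Pruning(prune_conf)
+quantizer = Quantization(quantization_aware_training_conf)
+scheduler = Scheduler()
+scheduler.model = model
+# combine the three components so they are optimized together in a single training run
+combination = scheduler.combine(distiller, prune, quantizer)
+scheduler.append(combination)
+opt_model = scheduler.fit()
+```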
+
+## Examples
+
+[Orchestration Examples](../examples/README.md#orchestration)
diff --git a/docs/platform_configuration.md b/docs/source/platform_configuration.md
similarity index 100%
rename from docs/platform_configuration.md
rename to docs/source/platform_configuration.md
diff --git a/docs/pruning.md b/docs/source/pruning.md
old mode 100755
new mode 100644
similarity index 92%
rename from docs/pruning.md
rename to docs/source/pruning.md
index 1cf9f570642..e886bb0ffae
--- a/docs/pruning.md
+++ b/docs/source/pruning.md
@@ -1,234 +1,234 @@
-Pruning
-============
-
-1. [Introduction](#introduction)
-
- 1.1. [Neural Network Pruning](#neural-network-pruning)
-
- 1.2. [Pruning Patterns](#pruning-patterns)
-
- 1.3. [Pruning Criteria](#pruning-criteria)
-
- 1.4. [Pruning Schedule](#pruning-schedule)
-
-2. [Pruning Support Matrix](#pruning-support-matrix)
-
-3. [Get Started With Pruning API](#get-started-with-pruning-api)
-
-4. [Examples](#examples)
-
-## Introduction
-
-### Neural Network Pruning
-Neural network pruning (briefly known as pruning or sparsity) is one of the most promising model compression techniques. It removes the least important parameters in the network and achieves compact architectures with minimal accuracy drop and maximal inference acceleration. As current state-of-the-art models have increasingly more parameters, pruning plays a crucial role in enabling them to run on devices whose memory footprints and computing resources are limited.
-
-
-
-
-
-
-### Pruning Patterns
-
-Pruning patterns defines the rules of pruned weights' arrangements in space.
-
-
-
-
-
-
-- Unstructured Pruning
-
-Unstructured pruning means finding and removing the less salient connection in the model where the nonzero patterns are irregular and could be anywhere in the matrix.
-
-- 2in4 Pruning
-
-NVIDIA proposed [2:4 sparsity](https://developer.nvidia.com/blog/accelerating-inference-with-sparsity-using-ampere-and-tensorrt/) (or known as "2in4 sparsity") in Ampere architecture, for every 4 continuous elements in a matrix, two of them are zero and others are non-zero.
-
-- Structured Pruning
-
-Structured pruning means finding parameters in groups, deleting entire blocks, filters, or channels according to some pruning criterions. In general, structured pruning leads to lower accuracy due to restrictive structure than unstructured pruning; However, it can accelerate the model execution significantly because it can fit hardware design better.
-
-Different from 2:4 sparsity above, we propose the block-wise structured sparsity patterns that we are able to demonstrate the performance benefits on existing Intel hardwares even without the support of hardware sparsity. A block-wise sparsity pattern with block size ```S``` means the contiguous ```S``` elements in this block are all zero values.
-
-For a typical GEMM, the weight dimension is ```IC``` x ```OC```, where ```IC``` is the number of input channels and ```OC``` is the number of output channels. Note that sometimes ```IC``` is also called dimension ```K```, and ```OC``` is called dimension ```N```. The sparsity dimension is on ```OC``` (or ```N```).
-
-For a typical Convolution, the weight dimension is ```OC x IC x KH x KW```, where ```OC``` is the number of output channels, ```IC``` is the number of input channels, and ```KH``` and ```KW``` is the kernel height and weight. The sparsity dimension is also on ```OC```.
-
-Here is a figure showing a matrix with ```IC``` = 32 and ```OC``` = 16 dimension, and a block-wise sparsity pattern with block size 4 on ```OC``` dimension.
-
-
-
-
-
-### Pruning Criteria
-
-Pruning criteria defines the rules of which weights are least important to be pruned, in order to maintain the model's original accuracy. Most popular criteria examine weights' absolute value and their corresponding gradients.
-
-- Magnitude
-
- The algorithm prunes the weight by the lowest absolute value at each layer with given sparsity target.
-
-- Gradient sensitivity
-
- The algorithm prunes the head, intermediate layers, and hidden states in NLP model according to importance score calculated by following the paper [FastFormers](https://arxiv.org/abs/2010.13382).
-
-- Group Lasso
-
- The algorithm uses Group lasso regularization to prune entire rows, columns or blocks of parameters that result in a smaller dense network.
-
-- Pattern Lock
-
- The algorithm locks the sparsity pattern in fine tune phase by freezing those zero values of weight tensor during weight update of training.
-
-- SNIP
-
- The algorithm prunes the dense model at its initialization, by analyzing the weights' effect to the loss function when they are masked. Please refer to the original [paper](https://arxiv.org/abs/1810.02340) for details
-
-- SNIP with momentum
-
- The algorithm improves original SNIP algorithms and introduces weights' score maps which updates in a momentum way.\
- In the following formula, $n$ is the pruning step and $W$ and $G$ are model's weights and gradients respectively.
- $$Score_{n} = 1.0 \times Score_{n-1} + 0.9 \times |W_{n} \times G_{n}|$$
-
-### Pruning Schedule
-
-Pruning schedule defines the way the model reach the target sparsity (the ratio of pruned weights).
-
-- One-shot Pruning
-
- One-shot pruning means the model is pruned to its target sparsity with one single step. This pruning method often works at model's initialization step. It can easily cause accuracy drop, but save much training time.
-
-- Iterative Pruning
-
- Iterative pruning means the model is gradually pruned to its target sparsity during a training process. The pruning process contains several pruning steps, and each step raises model's sparsity to a higher value. In the final pruning step, the model reaches target sparsity and the pruning process ends.
-
-## Pruning Support Matrix
-
-
-
-
-
Pruning Type
-
Pruning Granularity
-
Pruning Algorithm
-
Framework
-
-
-
-
-
Unstructured Pruning
-
Element-wise
-
Magnitude
-
PyTorch, TensorFlow
-
-
-
Pattern Lock
-
PyTorch
-
-
-
SNIP with momentum
-
PyTorch
-
-
-
Structured Pruning
-
Filter/Channel-wise
-
Gradient Sensitivity
-
PyTorch
-
-
-
SNIP with momentum
-
PyTorch
-
-
-
Block-wise
-
Group Lasso
-
PyTorch
-
-
-
SNIP with momentum
-
PyTorch
-
-
-
Element-wise
-
Pattern Lock
-
PyTorch
-
-
-
SNIP with momentum
-
PyTorch
-
-
-
-
-## Get Started with Pruning API
-
-Neural Compressor `Pruning` API is defined under `neural_compressor.experimental.Pruning`, which takes a user defined yaml file as input. Below is the launcher code of applying the API to execute a pruning process.
-
-```python
-from neural_compressor.experimental import Pruning
-prune = Pruning('/path/to/user/pruning/yaml')
-prune.model = model
-model = prune.fit()
-```
-
-Users can pass the customized training/evaluation functions to `Pruning` for flexible scenarios. In this case, pruning process can be done by pre-defined hooks in Neural Compressor. Users need to put those hooks inside the training function.
-
-Neural Compressor defines several hooks for users to use:
-
-```
-on_epoch_begin(epoch) : Hook executed at each epoch beginning
-on_step_begin(batch) : Hook executed at each batch beginning
-on_step_end() : Hook executed at each batch end
-on_epoch_end() : Hook executed at each epoch end
-on_before_optimizer_step() : Hook executed after gradients calculated and before backward
-```
-
-Following section shows how to use hooks in user pass-in training function which is part of example from BERT training:
-
-```python
-def pruning_func(model):
- for epoch in range(int(args.num_train_epochs)):
- pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
- model.train()
- prune.on_epoch_begin(epoch)
- for step, batch in enumerate(train_dataloader):
- prune.on_step_begin(step)
- batch = tuple(t.to(args.device) for t in batch)
- inputs = {'input_ids': batch[0],
- 'attention_mask': batch[1],
- 'labels': batch[3]}
- #inputs['token_type_ids'] = batch[2]
- outputs = model(**inputs)
- loss = outputs[0] # model outputs are always tuple in transformers (see doc)
-
- if args.n_gpu > 1:
- loss = loss.mean() # mean() to average on multi-gpu parallel training
- if args.gradient_accumulation_steps > 1:
- loss = loss / args.gradient_accumulation_steps
-
- loss.backward()
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
- if (step + 1) % args.gradient_accumulation_steps == 0:
- prune.on_before_optimizer_step()
- optimizer.step()
- scheduler.step() # Update learning rate schedule
- model.zero_grad()
-
- prune.on_step_end()
-...
-```
-In this case, the launcher code is like the following:
-
-```python
-from neural_compressor.experimental import Pruning, common
-prune = Pruning(args.config)
-prune.model = model
-prune.train_func = pruning_func
-model = prune.fit()
-```
-
-## Examples
-
-We validate the sparsity on typical models across different domains (including CV, NLP, and Recommendation System). [Validated pruning examples](../docs/validated_model_list.md#validated-pruning-examples) shows the sparsity pattern, sparsity ratio, and accuracy of sparse and dense (Reference) model for each model.
-
-Please refer to pruning examples([TensorFlow](../examples/README.md#Pruning), [PyTorch](../examples/README.md#Pruning-1)) for more information.
+Pruning
+============
+
+1. [Introduction](#introduction)
+
+ 1.1. [Neural Network Pruning](#neural-network-pruning)
+
+ 1.2. [Pruning Patterns](#pruning-patterns)
+
+ 1.3. [Pruning Criteria](#pruning-criteria)
+
+ 1.4. [Pruning Schedule](#pruning-schedule)
+
+2. [Pruning Support Matrix](#pruning-support-matrix)
+
+3. [Get Started With Pruning API](#get-started-with-pruning-api)
+
+4. [Examples](#examples)
+
+## Introduction
+
+### Neural Network Pruning
+Neural network pruning (often simply referred to as pruning or sparsity) is one of the most promising model compression techniques. It removes the least important parameters in the network and achieves compact architectures with minimal accuracy drop and maximal inference acceleration. As current state-of-the-art models have increasingly more parameters, pruning plays a crucial role in enabling them to run on devices whose memory footprint and computing resources are limited.
+
+
+
+
+
+
+### Pruning Patterns
+
+Pruning patterns define how the pruned weights are arranged in space.
+
+
+
+
+
+
+- Unstructured Pruning
+
+Unstructured pruning means finding and removing the less salient connections in the model, where the nonzero patterns are irregular and can be anywhere in the matrix.
+
+- 2in4 Pruning
+
+NVIDIA proposed [2:4 sparsity](https://developer.nvidia.com/blog/accelerating-inference-with-sparsity-using-ampere-and-tensorrt/) (also known as "2in4 sparsity") in the Ampere architecture: for every 4 contiguous elements in a matrix, two of them are zero and the others are non-zero.
+
+- Structured Pruning
+
+Structured pruning means finding parameters in groups and deleting entire blocks, filters, or channels according to some pruning criterion. In general, structured pruning leads to lower accuracy than unstructured pruning due to its more restrictive structure; however, it can accelerate model execution significantly because it fits hardware designs better.
+
+Different from the 2:4 sparsity above, we propose block-wise structured sparsity patterns for which we are able to demonstrate performance benefits on existing Intel hardware even without hardware sparsity support. A block-wise sparsity pattern with block size ```S``` means the contiguous ```S``` elements in a block are all zero values.
+
+For a typical GEMM, the weight dimension is ```IC``` x ```OC```, where ```IC``` is the number of input channels and ```OC``` is the number of output channels. Note that sometimes ```IC``` is also called dimension ```K```, and ```OC``` is called dimension ```N```. The sparsity dimension is on ```OC``` (or ```N```).
+
+For a typical Convolution, the weight dimension is ```OC x IC x KH x KW```, where ```OC``` is the number of output channels, ```IC``` is the number of input channels, and ```KH``` and ```KW``` are the kernel height and width. The sparsity dimension is also on ```OC```.
+
+Here is a figure showing a matrix with ```IC``` = 32 and ```OC``` = 16 dimensions, and a block-wise sparsity pattern with block size 4 on the ```OC``` dimension.
+
+
+
+
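+For illustration only (a hedged sketch, not the Neural Compressor implementation), block-wise magnitude pruning with block size 4 along the ```OC``` dimension could be emulated as follows; all sizes and the 50% target sparsity are assumptions for the example:
+
+```python
+import numpy as np
+
+IC, OC, BLOCK, TARGET_SPARSITY = 32, 16, 4, 0.5
+weight = np.random.randn(IC, OC)
+
+# Score each contiguous block of 4 output-channel elements by its L1 magnitude.
+blocks = weight.reshape(IC, OC // BLOCK, BLOCK)
+scores = np.abs(blocks).sum(axis=-1)                 # shape: (IC, OC // BLOCK)
+
+# Keep only the highest-scoring blocks until the target sparsity is reached.
+threshold = np.quantile(scores, TARGET_SPARSITY)
+mask = (scores > threshold)[..., None]               # broadcast the block decision to its 4 elements
+pruned_weight = (blocks * mask).reshape(IC, OC)
+print("sparsity:", float((pruned_weight == 0).mean()))
+```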
+
+### Pruning Criteria
+
+Pruning criteria define which weights are least important and can be pruned while maintaining the model's original accuracy. The most popular criteria examine the weights' absolute values and their corresponding gradients.
+
+- Magnitude
+
+  The algorithm prunes the weights with the lowest absolute values at each layer, given a sparsity target.
+
+- Gradient sensitivity
+
+  The algorithm prunes the heads, intermediate layers, and hidden states in NLP models according to an importance score calculated by following the paper [FastFormers](https://arxiv.org/abs/2010.13382).
+
+- Group Lasso
+
+  The algorithm uses Group Lasso regularization to prune entire rows, columns, or blocks of parameters, which results in a smaller dense network.
+
+- Pattern Lock
+
+  The algorithm locks the sparsity pattern in the fine-tuning phase by freezing the zero values of the weight tensor during the weight updates of training.
+
+- SNIP
+
+  The algorithm prunes the dense model at its initialization by analyzing the weights' effect on the loss function when they are masked. Please refer to the original [paper](https://arxiv.org/abs/1810.02340) for details.
+
+- SNIP with momentum
+
+  The algorithm improves the original SNIP algorithm and introduces weight score maps that are updated in a momentum fashion (see the sketch below).\
+ In the following formula, $n$ is the pruning step and $W$ and $G$ are model's weights and gradients respectively.
+ $$Score_{n} = 1.0 \times Score_{n-1} + 0.9 \times |W_{n} \times G_{n}|$$
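+
+  For illustration only, a hedged one-line sketch of this score update (with hypothetical NumPy arrays for the weights and gradients) is:
+
+  ```python
+  import numpy as np
+
+  def update_score(score, weight, grad):
+      """Momentum-style SNIP score update: Score_n = 1.0 * Score_{n-1} + 0.9 * |W_n * G_n|."""
+      return 1.0 * score + 0.9 * np.abs(weight * grad)
+  ```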
+
+### Pruning Schedule
+
+The pruning schedule defines the way the model reaches the target sparsity (the ratio of pruned weights).
+
+- One-shot Pruning
+
+  One-shot pruning means the model is pruned to its target sparsity in a single step. This pruning method often works at the model's initialization step. It can easily cause an accuracy drop, but it saves much training time.
+
+- Iterative Pruning
+
+  Iterative pruning means the model is gradually pruned to its target sparsity during a training process. The pruning process contains several pruning steps, and each step raises the model's sparsity to a higher value. In the final pruning step, the model reaches the target sparsity and the pruning process ends.
+
+## Pruning Support Matrix
+
+|Pruning Type         |Pruning Granularity |Pruning Algorithm    |Framework           |
+|---------------------|--------------------|---------------------|--------------------|
+|Unstructured Pruning |Element-wise        |Magnitude            |PyTorch, TensorFlow |
+|Unstructured Pruning |Element-wise        |Pattern Lock         |PyTorch             |
+|Unstructured Pruning |Element-wise        |SNIP with momentum   |PyTorch             |
+|Structured Pruning   |Filter/Channel-wise |Gradient Sensitivity |PyTorch             |
+|Structured Pruning   |Filter/Channel-wise |SNIP with momentum   |PyTorch             |
+|Structured Pruning   |Block-wise          |Group Lasso          |PyTorch             |
+|Structured Pruning   |Block-wise          |SNIP with momentum   |PyTorch             |
+|Structured Pruning   |Element-wise        |Pattern Lock         |PyTorch             |
+|Structured Pruning   |Element-wise        |SNIP with momentum   |PyTorch             |
+
+## Get Started with Pruning API
+
+The Neural Compressor `Pruning` API is defined under `neural_compressor.experimental.Pruning`, and it takes a user-defined YAML file as input. Below is the launcher code for applying the API to execute a pruning process.
+
+```python
+from neural_compressor.experimental import Pruning
+prune = Pruning('/path/to/user/pruning/yaml')
+prune.model = model
+model = prune.fit()
+```
+
+Users can pass customized training/evaluation functions to `Pruning` for flexible scenarios. In this case, the pruning process is driven by pre-defined hooks in Neural Compressor, and users need to invoke those hooks inside the training function.
+
+Neural Compressor defines several hooks for users to use:
+
+```
+on_epoch_begin(epoch) : Hook executed at each epoch beginning
+on_step_begin(batch) : Hook executed at each batch beginning
+on_step_end() : Hook executed at each batch end
+on_epoch_end() : Hook executed at each epoch end
+on_before_optimizer_step() : Hook executed after gradients are calculated and before the optimizer step
+```
+
+The following section shows how to use these hooks in a user-defined training function, taken in part from the BERT training example:
+
+```python
+def pruning_func(model):
+ for epoch in range(int(args.num_train_epochs)):
+ pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
+ model.train()
+ prune.on_epoch_begin(epoch)
+ for step, batch in enumerate(train_dataloader):
+ prune.on_step_begin(step)
+ batch = tuple(t.to(args.device) for t in batch)
+ inputs = {'input_ids': batch[0],
+ 'attention_mask': batch[1],
+ 'labels': batch[3]}
+ #inputs['token_type_ids'] = batch[2]
+ outputs = model(**inputs)
+ loss = outputs[0] # model outputs are always tuple in transformers (see doc)
+
+ if args.n_gpu > 1:
+ loss = loss.mean() # mean() to average on multi-gpu parallel training
+ if args.gradient_accumulation_steps > 1:
+ loss = loss / args.gradient_accumulation_steps
+
+ loss.backward()
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+ if (step + 1) % args.gradient_accumulation_steps == 0:
+ prune.on_before_optimizer_step()
+ optimizer.step()
+ scheduler.step() # Update learning rate schedule
+ model.zero_grad()
+
+ prune.on_step_end()
+...
+```
+In this case, the launcher code looks like the following:
+
+```python
+from neural_compressor.experimental import Pruning, common
+prune = Pruning(args.config)
+prune.model = model
+prune.train_func = pruning_func
+model = prune.fit()
+```
+
+## Examples
+
+We validate the sparsity on typical models across different domains (including CV, NLP, and recommendation systems). [Validated pruning examples](../docs/validated_model_list.md#validated-pruning-examples) shows the sparsity pattern, sparsity ratio, and accuracy of the sparse and dense (reference) models for each example.
+
+Please refer to the pruning examples ([TensorFlow](../examples/README.md#Pruning), [PyTorch](../examples/README.md#Pruning-1)) for more information.
diff --git a/docs/publication_list.md b/docs/source/publication_list.md
similarity index 100%
rename from docs/publication_list.md
rename to docs/source/publication_list.md
diff --git a/docs/pythonic_style.md b/docs/source/pythonic_style.md
similarity index 97%
rename from docs/pythonic_style.md
rename to docs/source/pythonic_style.md
index 508f315ff56..3f09059f197 100644
--- a/docs/pythonic_style.md
+++ b/docs/source/pythonic_style.md
@@ -1,136 +1,136 @@
-Pythonic Style Access for Configurations
-====
-
-1. [Introduction](#introduction)
-2. [Supported Feature Matrix](#supported-feature-matrix)
-3. [Get Started with Pythonic API for Configurations](#get-started-with-pythonic-api-for-configurations)
-
-## Introduction
-To meet the variety of needs arising from various circumstances, INC now provides a
-pythonic style access - Pythonic API - for same purpose of either user or framework configurations.
-
-The Pythonic API for Configuration allows users to specify configurations
-directly in their python codes without referring to
-a separate YAML file. While we support both simultaneously,
-the Pythonic API for Configurations has several advantages over YAML files,
-which one can tell from usages in the context below. Hence, we recommend
-users to use the Pythonic API for Configurations moving forward.
-
-## Supported Feature Matrix
-
-### Pythonic API for User Configurations
-| Optimization Techniques | Pythonic API |
-|-------------------------|:------------:|
-| Quantization | ✔ |
-| Pruning | ✔ |
-| Distillation | ✔ |
-| NAS | ✔ |
-### Pythonic API for Framework Configurations
-
-| Framework | Pythonic API |
-|------------|:------------:|
-| TensorFlow | ✔ |
-| PyTorch | ✔ |
-| ONNX | ✔ |
-| MXNet | ✔ |
-
-## Get Started with Pythonic API for Configurations
-
-### Pythonic API for User Configurations
-Now, let's go through the Pythonic API for Configurations in the order of
-sections similar as in user YAML files.
-
-#### Quantization
-
-To specify quantization configurations, users can use the following
-Pythonic API step by step.
-
-* First, load the ***config*** module
-```python
-from neural_compressor import config
-```
-* Next, assign values to the attributes of *config.quantization* to use specific configurations, and pass the config to *Quantization* API.
-```python
-config.quantization.inputs = ['image'] # list of str
-config.quantization.outputs = ['out'] # list of str
-config.quantization.backend = 'onnxrt_integerops' # support tensorflow, tensorflow_itex, pytorch, pytorch_ipex, pytorch_fx, onnxrt_qlinearops, onnxrt_integerops, onnxrt_qdq, onnxrt_qoperator, mxnet
-config.quantization.approach = 'post_training_dynamic_quant' # support post_training_static_quant, post_training_dynamic_quant, quant_aware_training
-config.quantization.device = 'cpu' # support cpu, gpu
-config.quantization.op_type_list = {'Conv': {'weight': {'dtype': ['fp32']}, 'activation': {'dtype': ['fp32']}}} # dict
-config.quantization.strategy = 'mse' # support basic, mse, bayesian, random, exhaustive
-config.quantization.objective = 'accuracy' # support performance, accuracy, modelsize, footprint
-config.quantization.timeout = 100 # int, default is 0
-config.quantization.accuracy_criterion.relative = 0.5 # float, default is 0.01
-config.quantization.reduce_range = False # bool. default value depends on hardware, True if cpu supports VNNI instruction, otherwise is False
-config.quantization.use_bf16 = False # bool
-from neural_compressor.experimental import Quantization
-quantizer = Quantization(config)
-```
-
-#### Distillation
-To specify distillation configurations, users can assign values to
-the corresponding attributes.
-```python
-from neural_compressor import config
-config.distillation.optimizer = {'SGD': {'learning_rate': 0.0001}}
-
-from neural_compressor.experimental import Distillation
-distiller = Distillation(config)
-```
-#### Pruning
-To specify pruning configurations, users can assign values to the corresponding attributes.
-```python
-from neural_compressor import config
-config.pruning.weight_compression.initial_sparsity = 0.0
-config.pruning.weight_compression.target_sparsity = 0.9
-config.pruning.weight_compression.max_sparsity_ratio_per_layer = 0.98
-config.pruning.weight_compression.prune_type = "basic_magnitude"
-config.pruning.weight_compression.start_epoch = 0
-config.pruning.weight_compression.end_epoch = 3
-config.pruning.weight_compression.start_step = 0
-config.pruning.weight_compression.end_step = 0
-config.pruning.weight_compression.update_frequency = 1.0
-config.pruning.weight_compression.update_frequency_on_step = 1
-config.pruning.weight_compression.prune_domain = "global"
-config.pruning.weight_compression.pattern = "tile_pattern_1x1"
-
-from neural_compressor.experimental import Pruning
-prune = Pruning(config)
-```
-#### NAS
-To specify nas configurations, users can assign values to the
-corresponding attributes.
-
-```python
-from neural_compressor import config
-config.nas.approach = 'dynas'
-from neural_compressor.experimental import NAS
-nas = NAS(config)
-```
-
-
-#### Benchmark
-To specify benchmark configurations, users can assign values to the
-corresponding attributes.
-```python
-from neural_compressor import config
-config.benchmark.warmup = 10
-config.benchmark.iteration = 10
-config.benchmark.cores_per_instance = 10
-config.benchmark.num_of_instance = 10
-config.benchmark.inter_num_of_threads = 10
-config.benchmark.intra_num_of_threads = 10
-
-from neural_compressor.experimental import Benchmark
-benchmark = Benchmark(config)
-```
-### Pythonic API for Framework Configurations
-Now, let's go through the Pythonic API for Configurations in setting up similar framework
-capabilities as in YAML files. Users can specify a framework's (eg. ONNX Runtime) capability by
-assigning values to corresponding attributes.
-
-```python
-config.onnxruntime.precisions = ['int8', 'uint8']
-config.onnxruntime.graph_optimization_level = 'DISABLE_ALL' # only onnxruntime has graph_optimization_level attribute
-```
-
+Pythonic Style Access for Configurations
+====
+
+1. [Introduction](#introduction)
+2. [Supported Feature Matrix](#supported-feature-matrix)
+3. [Get Started with Pythonic API for Configurations](#get-started-with-pythonic-api-for-configurations)
+
+## Introduction
+To meet the variety of needs arising from different circumstances, INC now provides a
+pythonic style of access - the Pythonic API - that serves the same purpose for both user and framework configurations.
+
+The Pythonic API for Configurations allows users to specify configurations
+directly in their Python code without referring to
+a separate YAML file. While both styles are supported simultaneously,
+the Pythonic API for Configurations has several advantages over YAML files,
+as the usage examples below illustrate. Hence, we recommend
+that users adopt the Pythonic API for Configurations moving forward.
+
+## Supported Feature Matrix
+
+### Pythonic API for User Configurations
+| Optimization Techniques | Pythonic API |
+|-------------------------|:------------:|
+| Quantization | ✔ |
+| Pruning | ✔ |
+| Distillation | ✔ |
+| NAS | ✔ |
+### Pythonic API for Framework Configurations
+
+| Framework | Pythonic API |
+|------------|:------------:|
+| TensorFlow | ✔ |
+| PyTorch | ✔ |
+| ONNX | ✔ |
+| MXNet | ✔ |
+
+## Get Started with Pythonic API for Configurations
+
+### Pythonic API for User Configurations
+Now, let's walk through the Pythonic API for Configurations, following an order of
+sections similar to that of user YAML files.
+
+#### Quantization
+
+To specify quantization configurations, users can use the following
+Pythonic API step by step.
+
+* First, load the ***config*** module
+```python
+from neural_compressor import config
+```
+* Next, assign values to the attributes of *config.quantization* to select specific configurations, and pass the config to the *Quantization* API.
+```python
+config.quantization.inputs = ['image'] # list of str
+config.quantization.outputs = ['out'] # list of str
+config.quantization.backend = 'onnxrt_integerops' # support tensorflow, tensorflow_itex, pytorch, pytorch_ipex, pytorch_fx, onnxrt_qlinearops, onnxrt_integerops, onnxrt_qdq, onnxrt_qoperator, mxnet
+config.quantization.approach = 'post_training_dynamic_quant' # support post_training_static_quant, post_training_dynamic_quant, quant_aware_training
+config.quantization.device = 'cpu' # support cpu, gpu
+config.quantization.op_type_list = {'Conv': {'weight': {'dtype': ['fp32']}, 'activation': {'dtype': ['fp32']}}} # dict
+config.quantization.strategy = 'mse' # support basic, mse, bayesian, random, exhaustive
+config.quantization.objective = 'accuracy' # support performance, accuracy, modelsize, footprint
+config.quantization.timeout = 100 # int, default is 0
+config.quantization.accuracy_criterion.relative = 0.5 # float, default is 0.01
+config.quantization.reduce_range = False # bool. default value depends on hardware, True if cpu supports VNNI instruction, otherwise is False
+config.quantization.use_bf16 = False # bool
+from neural_compressor.experimental import Quantization
+quantizer = Quantization(config)
+```
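+
+With the configuration in place, a minimal end-to-end sketch looks as follows (assuming an FP32 `model` and a calibration `dataset` are already defined; the dataloader wrapper and `fit()` call follow the experimental API pattern):
+```python
+from neural_compressor.experimental import Quantization, common
+
+quantizer = Quantization(config)
+quantizer.model = model                                  # FP32 model to be quantized
+quantizer.calib_dataloader = common.DataLoader(dataset)  # calibration data
+q_model = quantizer.fit()                                # returns the quantized model
+q_model.save('./quantized_model')
+```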
+
+#### Distillation
+To specify distillation configurations, users can assign values to
+the corresponding attributes.
+```python
+from neural_compressor import config
+config.distillation.optimizer = {'SGD': {'learning_rate': 0.0001}}
+
+from neural_compressor.experimental import Distillation
+distiller = Distillation(config)
+```
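+
+A typical follow-up, sketched under the assumption that the distiller exposes `model`/`teacher_model` attributes and a `fit()` entry point like other experimental components (`student_model` and `teacher_model` are user-defined here):
+```python
+distiller.model = student_model          # model to be trained with distillation
+distiller.teacher_model = teacher_model  # pre-trained teacher that provides soft targets
+distilled_model = distiller.fit()
+```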
+#### Pruning
+To specify pruning configurations, users can assign values to the corresponding attributes.
+```python
+from neural_compressor import config
+config.pruning.weight_compression.initial_sparsity = 0.0
+config.pruning.weight_compression.target_sparsity = 0.9
+config.pruning.weight_compression.max_sparsity_ratio_per_layer = 0.98
+config.pruning.weight_compression.prune_type = "basic_magnitude"
+config.pruning.weight_compression.start_epoch = 0
+config.pruning.weight_compression.end_epoch = 3
+config.pruning.weight_compression.start_step = 0
+config.pruning.weight_compression.end_step = 0
+config.pruning.weight_compression.update_frequency = 1.0
+config.pruning.weight_compression.update_frequency_on_step = 1
+config.pruning.weight_compression.prune_domain = "global"
+config.pruning.weight_compression.pattern = "tile_pattern_1x1"
+
+from neural_compressor.experimental import Pruning
+prune = Pruning(config)
+```
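+
+A matching usage sketch (assuming a `model` and a user-defined training loop are available; the `train_func` attribute name is assumed here - older releases expose it as `pruning_func`):
+```python
+prune.model = model            # FP32 model to be pruned
+prune.train_func = train_func  # user-defined training loop that drives the pruning schedule
+pruned_model = prune.fit()
+```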
+#### NAS
+To specify NAS configurations, users can assign values to the
+corresponding attributes.
+
+```python
+from neural_compressor import config
+config.nas.approach = 'dynas'
+from neural_compressor.experimental import NAS
+nas = NAS(config)
+```
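+
+A possible next step, shown only as a hypothetical sketch (it assumes the NAS object exposes a `search()` entry point that returns the discovered architectures):
+```python
+best_model_archs = nas.search()  # launch the search defined by config.nas
+```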
+
+
+#### Benchmark
+To specify benchmark configurations, users can assign values to the
+corresponding attributes.
+```python
+from neural_compressor import config
+config.benchmark.warmup = 10
+config.benchmark.iteration = 10
+config.benchmark.cores_per_instance = 10
+config.benchmark.num_of_instance = 10
+config.benchmark.inter_num_of_threads = 10
+config.benchmark.intra_num_of_threads = 10
+
+from neural_compressor.experimental import Benchmark
+benchmark = Benchmark(config)
+```
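+
+A usage sketch under the assumption that the benchmark object takes a `model` attribute and is invoked with the mode name, as in the experimental examples:
+```python
+benchmark.model = model   # model under test
+benchmark('performance')  # run the performance mode configured above
+```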
+### Pythonic API for Framework Configurations
+Now, let's go through the Pythonic API for Configurations to set up framework
+capabilities similar to those defined in framework YAML files. Users can specify a framework's (e.g., ONNX Runtime) capabilities by
+assigning values to the corresponding attributes.
+
+```python
+config.onnxruntime.precisions = ['int8', 'uint8']
+config.onnxruntime.graph_optimization_level = 'DISABLE_ALL' # only onnxruntime has graph_optimization_level attribute
+```
+
diff --git a/docs/quantization_mixed_precision.md b/docs/source/quantization_mixed_precision.md
similarity index 87%
rename from docs/quantization_mixed_precision.md
rename to docs/source/quantization_mixed_precision.md
index 728c854da5c..9352a81f8cf 100644
--- a/docs/quantization_mixed_precision.md
+++ b/docs/source/quantization_mixed_precision.md
@@ -1,59 +1,59 @@
-### Turn ON Auto Mixed Precision during Quantization
-
-BF16 conversion during quantization is default OFF. To force enable it, users need to turn on use_bf16 by pythonic config:
-
-```python
-from neural_compressor import config
-from neural_compressor.experimental import Quantization
-
-config.quantization.use_bf16 = True
-quantizer = Quantization(config)
-```
-
-### Tensorflow
-
-Intel has worked with the TensorFlow development team to enhance TensorFlow to include bfloat16 data support for CPUs. For more information about BF16 in TensorFlow, please read [Accelerating AI performance on 3rd Gen Intel® Xeon® Scalable processors with TensorFlow and Bfloat16](https://blog.tensorflow.org/2020/06/accelerating-ai-performance-on-3rd-gen-processors-with-tensorflow-bfloat16.html).
-
-- BF16 conversion during quantization in TensorFlow
-
-
-
-
-
-
-- Three steps
-
-1. Convert to a `FP32 + INT8` mixed precision Graph
-
- In this steps, TF adaptor will regard all fallback datatype as `FP32`. According to the per op datatype in tuning config passed by strategy, TF adaptor will generate a `FP32 + INT8` mixed precision graph.
-
-2. Convert to a `BF16 + FP32 + INT8` mixed precision Graph
-
- In this phase, adaptor will convert some `FP32` ops to `BF16` according to `bf16_ops` list in tuning config.
-
-3. Optimize the `BF16 + FP32 + INT8` mixed precision Graph
-
- After the mixed precision graph generated, there are still some optimization need to be applied to improved the performance, for example `Cast + Cast` and so on. The `BF16Convert` transformer also apply a depth-first method to make it possible to take the ops use `BF16` which can support `BF16` datatype to reduce the insertion of `Cast` op.
-
-### PyTorch
-
-Intel has also worked with the PyTorch development team to enhance PyTorch to include bfloat16 data support for CPUs.
-
-- BF16 conversion during quantization in PyTorch
-
-
-
-
-
-- Two steps
-1. Convert to a `FP32 + INT8` mixed precision Graph or Module
-
- In this steps, PT adaptor will combine the `INT8` ops and all fallback ops to `FP32 + INT8` mixed precision Graph or Module no matter in Eager mode or Fx Graph mode.
-
-2. Convert to a `BF16 + FP32 + INT8` mixed precision Graph or Module
-
- In this phase, adaptor will according to `BF16` op list from strategy tune config to wrapper the `FP32` module with `BF16Wrapper` to realize the `BF16 + FP32 + INT8` mixed precision Graph or Module. adaptor will do retrace the `GraphModule` again if using Fx Graph mode.
+### Turn ON Auto Mixed Precision during Quantization
+
+BF16 conversion during quantization is OFF by default. To force-enable it, users need to turn on `use_bf16` via the pythonic config:
+
+```python
+from neural_compressor import config
+from neural_compressor.experimental import Quantization
+
+config.quantization.use_bf16 = True
+quantizer = Quantization(config)
+```
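+
+The rest of the flow is unchanged from ordinary post-training quantization; a minimal sketch (assuming `model` and a calibration `dataloader` already exist):
+```python
+quantizer.model = model
+quantizer.calib_dataloader = dataloader
+q_model = quantizer.fit()  # BF16-eligible FP32 ops are converted when use_bf16 is True
+```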
+
+### TensorFlow
+
+Intel has worked with the TensorFlow development team to enhance TensorFlow to include bfloat16 data support for CPUs. For more information about BF16 in TensorFlow, please read [Accelerating AI performance on 3rd Gen Intel® Xeon® Scalable processors with TensorFlow and Bfloat16](https://blog.tensorflow.org/2020/06/accelerating-ai-performance-on-3rd-gen-processors-with-tensorflow-bfloat16.html).
+
+- BF16 conversion during quantization in TensorFlow
+
+
+
+
+
+
+- Three steps
+
+1. Convert to a `FP32 + INT8` mixed precision Graph
+
+   In this step, the TF adaptor regards every fallback datatype as `FP32`. According to the per-op datatype in the tuning config passed by the strategy, the TF adaptor generates a `FP32 + INT8` mixed precision graph.
+
+2. Convert to a `BF16 + FP32 + INT8` mixed precision Graph
+
+   In this phase, the adaptor converts some `FP32` ops to `BF16` according to the `bf16_ops` list in the tuning config.
+
+3. Optimize the `BF16 + FP32 + INT8` mixed precision Graph
+
+   After the mixed precision graph is generated, some optimizations still need to be applied to improve performance, for example eliminating `Cast + Cast` patterns. The `BF16Convert` transformer also applies a depth-first method so that ops which support the `BF16` datatype stay in `BF16`, reducing the number of inserted `Cast` ops.
+
+### PyTorch
+
+Intel has also worked with the PyTorch development team to enhance PyTorch to include bfloat16 data support for CPUs.
+
+- BF16 conversion during quantization in PyTorch
+
+
+
+
+
+- Two steps
+1. Convert to a `FP32 + INT8` mixed precision Graph or Module
+
+   In this step, the PT adaptor combines the `INT8` ops and all fallback ops into a `FP32 + INT8` mixed precision Graph or Module, in both Eager mode and FX Graph mode.
+
+2. Convert to a `BF16 + FP32 + INT8` mixed precision Graph or Module
+
+   In this phase, the adaptor wraps the `FP32` modules listed in the strategy's `BF16` op list with `BF16Wrapper` to realize the `BF16 + FP32 + INT8` mixed precision Graph or Module. The adaptor retraces the `GraphModule` again when FX Graph mode is used.
diff --git a/docs/reference_examples.md b/docs/source/reference_examples.md
similarity index 100%
rename from docs/reference_examples.md
rename to docs/source/reference_examples.md
diff --git a/docs/releases_info.md b/docs/source/releases_info.md
similarity index 100%
rename from docs/releases_info.md
rename to docs/source/releases_info.md
diff --git a/docs/sigopt_strategy.md b/docs/source/sigopt_strategy.md
similarity index 100%
rename from docs/sigopt_strategy.md
rename to docs/source/sigopt_strategy.md
diff --git a/docs/tensorboard.md b/docs/source/tensorboard.md
similarity index 96%
rename from docs/tensorboard.md
rename to docs/source/tensorboard.md
index ad8965032fc..716e094eb35 100644
--- a/docs/tensorboard.md
+++ b/docs/source/tensorboard.md
@@ -185,13 +185,13 @@ See the [tensorflow.py](https://github.com/intel/neural-compressor/tree/master/n
* From the **GRAPHS** tab, select "baseline/." in the "Run" box and find the first 'Conv2d' op after 'input' op. The op name is "v0/cg/conv0/Relu":
-
+
* From the **GRAPHS** tab, select "tune_1/." in the "Run" box and find the first 'Conv2d' op after 'input' op. The tensor name is 'v0/cg/conv0/conv2d/Conv2D_eightbit_requantize':
-
+
* Switch to the **HISTOGRAMS** tab. Click the 'v0/cg/conv0' op name in the search box. TensorBoard groups the tensors with the same op name together so you can compare the tensor of baseline 'v0/cg/conv0/Relu' with the tensor of tune_1 'v0/cg/conv0/conv2d/Conv2D_eightbit_requantize_int8.output'. Note that the tensor name can be changed after quantization, so group the tensor by op name and compare. From the chart below, we can see that the histogram of the first conv2d output tensor are different. This is due to a known TensorFlow issue. After filtering the 'v0/cg/conv0/conv2d/Conv2D' op by adding "op_wise" in the yaml file, the issue disappears.
-
+
diff --git a/docs/transform.md b/docs/source/transform.md
similarity index 100%
rename from docs/transform.md
rename to docs/source/transform.md
diff --git a/docs/tuning_strategies.md b/docs/source/tuning_strategies.md
similarity index 99%
rename from docs/tuning_strategies.md
rename to docs/source/tuning_strategies.md
index 55c5fcae411..6e11941559a 100644
--- a/docs/tuning_strategies.md
+++ b/docs/source/tuning_strategies.md
@@ -17,7 +17,7 @@ Each strategy generates the next quantization configuration according to its
logic and the last quantization result. The function of strategies is shown
below:
-
+
Strategies begin with an adaptor layer (Framework Adaptor) where the user
passes a framework-specific model to initialize an instance of the
diff --git a/docs/user_yaml.md b/docs/source/user_yaml.md
similarity index 97%
rename from docs/user_yaml.md
rename to docs/source/user_yaml.md
index 179bf197531..c50d28a0e01 100644
--- a/docs/user_yaml.md
+++ b/docs/source/user_yaml.md
@@ -1,167 +1,167 @@
-User YAML Configuration Files
-=====
-1. [Introduction](#introduction)
-2. [Supported Feature Matrix](#supported-feature-matrix)
-3. [Get Started with User YAML Files](#get-started-with-user-yaml-files)
-
-
-## Introduction
-
-Intel® Neural Compressor uses YAML files for quick
-and user-friendly configurations. There are two types of YAML files -
-user YAML files and framework YAML files, which are used in
-running user cases and setting up framework capabilities, respectively.
-
-First, let's take a look at a user YAML file, It defines the model, tuning
-strategies, tuning calibrations and evaluations, and performance benchmarking
-of the passing model vs. original model.
-
-## Supported Feature Matrix
-
-| Optimization Techniques | YAML Configuration Files |
-|-------------------------|:------------------------:|
-| Quantization | ✔ |
-| Pruning | ✔ |
-| Distillation | ✔ |
-
-
-## Get started with User YAML Files
-
-
-A complete user YAML file is organized logically into several sections:
-
-* ***model***: The model specifications define a user model's name, inputs, outputs and framework.
-
-
-```yaml
-model: # mandatory. used to specify model specific information.
- name: mobilenet_v1
- framework: tensorflow # mandatory. supported values are tensorflow, pytorch, pytorch_ipex, onnxrt_integer, onnxrt_qlinear or mxnet; allow new framework backend extension.
- inputs: image_tensor # optional. inputs field is only required in tensorflow.
- outputs: num_detections,detection_boxes,detection_scores,detection_classes # optional. outputs field is only required in tensorflow.
-```
-* ***quantization***: The quantization specifications define quantization tuning space and related calibrations. To calibrate, users can
-specify *sampling_size* (optional) and use the subsection *dataloader* to specify
-the dataset location using *root* and transformation using *transform*. To
-implement tuning space constraints, users can use the subsection *model_wise* and *op_wise* for specific configurations.
-
-```yaml
-quantization: # optional. tuning constraints on model-wise for advance user to reduce tuning space.
- calibration:
- sampling_size: 20 # optional. default value is 100. used to set how many samples should be used in calibration.
- dataloader:
- dataset:
- ImageRecord:
- root: /path/to/imagenet/ # NOTE: modify to calibration dataset location if needed
- transform:
- BilinearImagenet:
- height: 224
- width: 224
- model_wise: # optional. tuning constraints on model-wise for advance user to reduce tuning space.
- weight:
- granularity: per_channel
- scheme: asym
- dtype: int8
- algorithm: minmax
- activation:
- granularity: per_tensor
- scheme: asym
- dtype: int8, fp32
- algorithm: minmax, kl
- op_wise: { # optional. tuning constraints on op-wise for advance user to reduce tuning space.
- 'conv1': {
- 'activation': {'dtype': ['uint8', 'fp32'],
- 'algorithm': ['minmax', 'kl'],
- 'scheme':['sym']},
- 'weight': {'dtype': ['int8', 'fp32'],
- 'algorithm': ['minmax']}
- }
- }
-```
-
-* ***pruning***: The pruning specifications define pruning tuning space. To define the training behavior, uses can
-use the subsection *train* to specify the training hyper-parameters and the training dataloader.
-To define the pruning approach, users can use the subsection *approach* to specify
-pruning target, choose the type of pruning algorithm, and the way to apply it
-during training process.
-
-```yaml
-pruning:
- train:
- dataloader:
- ...
- epoch: 40
- optimizer:
- Adam:
- learning_rate: 1e-06
- beta_1: 0.9
- beta_2: 0.999
- epsilon: 1e-07
- criterion:
- SparseCategoricalCrossentropy:
- reduction: sum_over_batch_size
- from_logits: False
- approach:
- weight_compression:
- initial_sparsity: 0.0
- target_sparsity: 0.54
- start_epoch: 0
- end_epoch: 19
- pruners:
- - !Pruner
- start_epoch: 0
- end_epoch: 19
- prune_type: basic_magnitude
-```
-* ***distillation***: The distillation specifications define distillation's tuning
-space. Similar to pruning, to define the training behavior, users can use the
-subsection *train* to specify the training hyper-parameters and the training
-dataloader and it is optional if users implement *train_func* and set the attribute
-of distillation instance to *train_func*. For criterion, Intel® Neural Compressor provides a built-in
-knowledge distillation loss class to calculate distillation loss.
-```yaml
-distillation:
- train:
- start_epoch: 0
- end_epoch: 90
- iteration: 1000
- frequency: 1
- dataloader:
- ...
- optimizer:
- SGD:
- learning_rate: 0.001
- momentum: 0.1
- nesterov: True
- weight_decay: 0.001
- criterion:
- KnowledgeDistillationLoss:
- temperature: 1.0
- loss_types: ['CE', 'CE']
- loss_weights: [0.5, 0.5]
-```
-* ***evaluation***: The evaluation specifications define the dataloader and metric for accuracy evaluation as well as dataloader
-and configurations for performance benchmarking.
-```yaml
-evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization.
- accuracy:
- metric:
- ...
- dataloader:
- ...
-```
-* ***tuning***: The tuning specifications define overall tuning targets. Users can
-use *accuracy_criterion* to specify the target of accuracy loss percentage and use
-*exit_policy* to specify the tuning timeout in seconds. The random
-seed can be specified using *random_seed*.
-
-```yaml
-tuning:
- accuracy_criterion:
- relative: 0.01 # the tuning target of accuracy loss percentage: 1%
- higher_is_better: True
- exit_policy:
- timeout: 0 # tuning timeout (seconds), 0 means early stop
- random_seed: 9527 # random seed
-```
-
+User YAML Configuration Files
+=====
+1. [Introduction](#introduction)
+2. [Supported Feature Matrix](#supported-feature-matrix)
+3. [Get Started with User YAML Files](#get-started-with-user-yaml-files)
+
+
+## Introduction
+
+Intel® Neural Compressor uses YAML files for quick
+and user-friendly configurations. There are two types of YAML files -
+user YAML files and framework YAML files, which are used in
+running user cases and setting up framework capabilities, respectively.
+
+First, let's take a look at a user YAML file. It defines the model, tuning
+strategies, tuning calibrations and evaluations, and the performance benchmarking
+of the tuned model vs. the original model.
+
+## Supported Feature Matrix
+
+| Optimization Techniques | YAML Configuration Files |
+|-------------------------|:------------------------:|
+| Quantization | ✔ |
+| Pruning | ✔ |
+| Distillation | ✔ |
+
+
+## Get Started with User YAML Files
+
+
+A complete user YAML file is organized logically into several sections:
+
+* ***model***: The model specifications define a user model's name, inputs, outputs and framework.
+
+
+```yaml
+model: # mandatory. used to specify model specific information.
+ name: mobilenet_v1
+ framework: tensorflow # mandatory. supported values are tensorflow, pytorch, pytorch_ipex, onnxrt_integer, onnxrt_qlinear or mxnet; allow new framework backend extension.
+ inputs: image_tensor # optional. inputs field is only required in tensorflow.
+ outputs: num_detections,detection_boxes,detection_scores,detection_classes # optional. outputs field is only required in tensorflow.
+```
+* ***quantization***: The quantization specifications define the quantization tuning space and the related calibration. To calibrate, users can
+specify *sampling_size* (optional) and use the subsection *dataloader* to specify
+the dataset location using *root* and the transformation using *transform*. To
+impose tuning space constraints, users can use the subsections *model_wise* and *op_wise* for specific configurations.
+
+```yaml
+quantization: # optional. tuning constraints on model-wise for advanced users to reduce the tuning space.
+ calibration:
+ sampling_size: 20 # optional. default value is 100. used to set how many samples should be used in calibration.
+ dataloader:
+ dataset:
+ ImageRecord:
+ root: /path/to/imagenet/ # NOTE: modify to calibration dataset location if needed
+ transform:
+ BilinearImagenet:
+ height: 224
+ width: 224
+  model_wise: # optional. tuning constraints on model-wise for advanced users to reduce the tuning space.
+ weight:
+ granularity: per_channel
+ scheme: asym
+ dtype: int8
+ algorithm: minmax
+ activation:
+ granularity: per_tensor
+ scheme: asym
+ dtype: int8, fp32
+ algorithm: minmax, kl
+  op_wise: { # optional. tuning constraints on op-wise for advanced users to reduce the tuning space.
+ 'conv1': {
+ 'activation': {'dtype': ['uint8', 'fp32'],
+ 'algorithm': ['minmax', 'kl'],
+ 'scheme':['sym']},
+ 'weight': {'dtype': ['int8', 'fp32'],
+ 'algorithm': ['minmax']}
+ }
+ }
+```
+
+* ***pruning***: The pruning specifications define the pruning tuning space. To define the training behavior, users can
+use the subsection *train* to specify the training hyper-parameters and the training dataloader.
+To define the pruning approach, users can use the subsection *approach* to specify
+the pruning target, choose the type of pruning algorithm, and set the way it is applied
+during the training process.
+
+```yaml
+pruning:
+ train:
+ dataloader:
+ ...
+ epoch: 40
+ optimizer:
+ Adam:
+ learning_rate: 1e-06
+ beta_1: 0.9
+ beta_2: 0.999
+ epsilon: 1e-07
+ criterion:
+ SparseCategoricalCrossentropy:
+ reduction: sum_over_batch_size
+ from_logits: False
+ approach:
+ weight_compression:
+ initial_sparsity: 0.0
+ target_sparsity: 0.54
+ start_epoch: 0
+ end_epoch: 19
+ pruners:
+ - !Pruner
+ start_epoch: 0
+ end_epoch: 19
+ prune_type: basic_magnitude
+```
+* ***distillation***: The distillation specifications define the distillation tuning
+space. Similar to pruning, users can use the
+subsection *train* to specify the training hyper-parameters and the training
+dataloader; this subsection is optional if users implement a *train_func* and set it as the
+*train_func* attribute of the distillation instance. For the criterion, Intel® Neural Compressor provides a built-in
+knowledge distillation loss class to calculate the distillation loss.
+```yaml
+distillation:
+ train:
+ start_epoch: 0
+ end_epoch: 90
+ iteration: 1000
+ frequency: 1
+ dataloader:
+ ...
+ optimizer:
+ SGD:
+ learning_rate: 0.001
+ momentum: 0.1
+ nesterov: True
+ weight_decay: 0.001
+ criterion:
+ KnowledgeDistillationLoss:
+ temperature: 1.0
+ loss_types: ['CE', 'CE']
+ loss_weights: [0.5, 0.5]
+```
+* ***evaluation***: The evaluation specifications define the dataloader and metric for accuracy evaluation as well as dataloader
+and configurations for performance benchmarking.
+```yaml
+evaluation: # optional. required if user doesn't provide eval_func in neural_compressor.Quantization.
+ accuracy:
+ metric:
+ ...
+ dataloader:
+ ...
+```
+* ***tuning***: The tuning specifications define the overall tuning targets. Users can
+use *accuracy_criterion* to specify the target accuracy-loss percentage and
+*exit_policy* to specify the tuning timeout in seconds. The random
+seed can be specified using *random_seed*.
+
+```yaml
+tuning:
+ accuracy_criterion:
+ relative: 0.01 # the tuning target of accuracy loss percentage: 1%
+ higher_is_better: True
+ exit_policy:
+ timeout: 0 # tuning timeout (seconds), 0 means early stop
+ random_seed: 9527 # random seed
+```
+
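+Once saved (for example as `conf.yaml` - a file name chosen here purely for illustration), the user YAML file is handed to the corresponding API; a minimal quantization sketch, assuming an FP32 `model` and a calibration `dataset` are defined elsewhere:
+```python
+from neural_compressor.experimental import Quantization, common
+
+quantizer = Quantization('conf.yaml')                    # the user YAML drives model, tuning, and evaluation
+quantizer.model = model
+quantizer.calib_dataloader = common.DataLoader(dataset)  # matches the calibration dataloader section
+q_model = quantizer.fit()
+```
+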
diff --git a/docs/validated_model_list.md b/docs/source/validated_model_list.md
similarity index 100%
rename from docs/validated_model_list.md
rename to docs/source/validated_model_list.md
diff --git a/docs/welcome.md b/docs/source/welcome.md
similarity index 96%
rename from docs/welcome.md
rename to docs/source/welcome.md
index 51e12e13a40..3531bf0e052 100644
--- a/docs/welcome.md
+++ b/docs/source/welcome.md
@@ -7,7 +7,7 @@ Intel® Neural Compressor (formerly known as Intel® Low Precision Optimization
| Architecture | Workflow |
| - | - |
-|  |  |
+|  |  |
Supported deep learning frameworks are:
diff --git a/docs/sphinx-requirements.txt b/docs/sphinx-requirements.txt
new file mode 100644
index 00000000000..b38e80ab0e9
--- /dev/null
+++ b/docs/sphinx-requirements.txt
@@ -0,0 +1,6 @@
+sphinx
+pytorch_sphinx_theme
+recommonmark
+sphinx-markdown-tables
+sphinx-md
+sphinx-autoapi
\ No newline at end of file
diff --git a/examples/notebook/usage_example.md b/examples/notebook/usage_example.md
index b1454ce04c8..90f910140e6 100644
--- a/examples/notebook/usage_example.md
+++ b/examples/notebook/usage_example.md
@@ -3,7 +3,7 @@
## Steps
The following diagram shows steps for enabling model with Neural Compressor:
-
+
## Example
diff --git a/examples/pytorch/nlp/huggingface_models/common/README.md b/examples/pytorch/nlp/huggingface_models/common/README.md
index 17a2b3d22e3..4904434f6b4 100644
--- a/examples/pytorch/nlp/huggingface_models/common/README.md
+++ b/examples/pytorch/nlp/huggingface_models/common/README.md
@@ -16,7 +16,7 @@ limitations under the License.
-
+
diff --git a/neural_compressor/experimental/common/criterion.py b/neural_compressor/experimental/common/criterion.py
index 0cbc1e3ac31..11308854d10 100644
--- a/neural_compressor/experimental/common/criterion.py
+++ b/neural_compressor/experimental/common/criterion.py
@@ -1518,4 +1518,4 @@ def __call__(self, **kwargs):
class: PyTorchSelfKnowledgeDistillationLoss
param dict (dict): param dict
"""
- return PyTorchSelfKnowledgeDistillationLoss, self._param_check()
+ return PyTorchSelfKnowledgeDistillationLoss, self._param_check()
\ No newline at end of file
diff --git a/neural_compressor/experimental/data/datasets/bert_dataset.py b/neural_compressor/experimental/data/datasets/bert_dataset.py
index 636b3bef28f..c22abaa996e 100644
--- a/neural_compressor/experimental/data/datasets/bert_dataset.py
+++ b/neural_compressor/experimental/data/datasets/bert_dataset.py
@@ -33,7 +33,7 @@
@dataset_registry(dataset_type="bert", framework="pytorch", dataset_format='')
class PytorchBertDataset(Dataset):
"""PyTorch dataset used for model Bert.
-
+
This Dataset is to construct from the Bert TensorDataset and not a full implementation
from yaml config. The original repo link is: https://github.com/huggingface/transformers.
When you want use this Dataset, you should add it before you initialize your DataLoader.
diff --git a/sphinx-requirements.txt b/sphinx-requirements.txt
deleted file mode 100755
index 71cfc10b849..00000000000
--- a/sphinx-requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-sphinx
-sphinx-rtd-theme
-recommonmark
-sphinx-markdown-tables
-sphinx-md
\ No newline at end of file