[Docs] Add documentation on how to add new operators #7

Merged: 6 commits, Nov 3, 2022
43 changes: 43 additions & 0 deletions .github/workflows/fast_tests.yaml
@@ -0,0 +1,43 @@
name: Python Tests (Fast)

on:
push:
branches: [main]
pull_request:
branches: [main]

jobs:
tests:
runs-on: [self-hosted, Linux, X64, gpu]
container:
image: nvidia/cuda:11.8.0-devel-ubuntu20.04
options: --gpus all
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.8"
- name: Setup cmake
uses: jwlawson/actions-setup-cmake@v1.13
with:
cmake-version: '3.19.x'
- name: Setup ccache
run: |
apt update && apt install -y ccache
- name: Build hidet
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements-dev.txt
bash scripts/build_wheel.sh
WHEEL=$(find ./scripts/ -maxdepth 1 -name '*.whl')
echo "Built wheel: $WHEEL"
pip install --no-deps --force-reinstall $WHEEL
- name: Run minimal tests
run: |
python -m pytest -v tests/minimal/test_add.py
- name: Run full tests
run: |
# run the full test suite
python -m pytest -v ./tests
13 changes: 10 additions & 3 deletions .github/workflows/tests.yaml
@@ -1,9 +1,16 @@
name: Python Tests
name: Python Tests (Full)

on:
push:
branches: [main]
pull_request:
branches: [main]
paths: # run when any of these files change
- 'include/**'
- 'src/**'
- 'python/hidet/backend/**'
- 'python/hidet/transforms/**'
- 'python/hidet/ir/**'

jobs:
tests:
@@ -38,5 +45,5 @@ jobs:
python -m pytest -v tests/minimal/test_add.py
- name: Run full tests
run: |
# stop the build if format is not correct
python -m pytest -v ./tests
# run the full test suite with a cleared cache
python -m pytest -v --clear-cache ./tests
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -96,6 +96,7 @@
"repository_url": "https://github.com/yaoyaoding/hidet",
"use_repository_button": True,
'logo_only': True,
"extra_navbar": r"<a href=/netron target=_blank>Customized Netron</a>",
}
html_title = "Hidet Documentation"
html_permalinks_icon = "<span>¶</span>"
@@ -104,7 +105,6 @@
googleanalytics_enabled = True



# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
3 changes: 2 additions & 1 deletion docs/source/index.rst
@@ -38,13 +38,14 @@ Hidet is an open-source DNN inference framework, it features
how-to-guides/add-new-operator/index
gallery/how-to-guides/add-operator-resolve-rule
gallery/how-to-guides/add-subgraph-rewrite-rule
gallery/how-to-guides/visualize-flow-graph


.. toctree::
:maxdepth: 1
:caption: Notes

notes/operator-cache
notes/visualize-flow-graph

.. toctree::
:maxdepth: 1
8 changes: 0 additions & 8 deletions docs/source/notes/visualize-flow-graph.rst

This file was deleted.

20 changes: 13 additions & 7 deletions gallery/how-to-guides/add-new-operator-compute-definition.py
@@ -18,7 +18,7 @@
DSL to define the mathematical definition of an operator.

The precise mathematical definition of each operator in Hidet is defined through a domain-specific-language (DSL).
In this article, we will show how to define the mathematical definition of a new operator in Hidet using this DSL,
In this tutorial, we will show how to define the mathematical definition of a new operator in Hidet using this DSL,
which is defined in the :py:mod:`hidet.ir.compute` module.


@@ -290,6 +290,7 @@ def run_task(task: Task, inputs: List[hidet.Tensor], outputs: List[hidet.Tensor]):
print(tensor)
print()


# %%
# The following code shows how to 1) define the computation, 2) define the task, and 3) build and run the task.
#
@@ -301,11 +302,13 @@ def run_task(task: Task, inputs: List[hidet.Tensor], outputs: List[hidet.Tensor]):
# high-level computation graph of a deep learning model. The latter is a tensor node in the domain-specific language
# that is used to describe the computation of a single operator.

import numpy as np
from hidet.ir.compute import tensor_input, reduce, compute, arg_reduce, TensorNode

# sphinx_gallery_start_ignore
# Hidet uses numpy for tensor printing; this line reduces the number of printed digits
import numpy as np
np.set_printoptions(precision=2, suppress=True)
# sphinx_gallery_end_ignore

def add_example():
a: TensorNode = tensor_input(name='a', dtype='float32', shape=[5])
@@ -317,6 +320,7 @@ def add_example():

add_example()
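For intuition, the add example above can be mirrored in plain NumPy. This is an illustrative sketch, not Hidet code, and it assumes the elided compute body defines the usual elementwise form c[i] = a[i] + b[i] over float32 vectors of shape [5]:

```python
import numpy as np

# Hypothetical NumPy equivalent of add_example above, assuming the elided
# compute body defines c[i] = a[i] + b[i] for float32 inputs of shape [5].
a = np.random.rand(5).astype('float32')
b = np.random.rand(5).astype('float32')

# Explicit loop form, mirroring compute(shape=[5], fcompute=lambda i: a[i] + b[i])
c = np.empty([5], dtype='float32')
for i in range(5):
    c[i] = a[i] + b[i]

# Matches NumPy's vectorized elementwise addition
assert np.allclose(c, a + b)
```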


# %%
# More Examples
# -------------
@@ -325,13 +329,14 @@ def add_example():
# :class: margin
#
# All the hidet operators are defined in :py:mod:`hidet.graph.ops` submodule. And all of existing operators
# are defined through the compute primitives described in this article. Feel free to check the source code to learn more
# about how to define the computation of different operators.
# are defined through the compute primitives described in this tutorial. Feel free to check the source code to learn
# more about how to define the computation of different operators.
#
# At last, we show more examples of using the compute primitives to define operator computation.
#
# ReduceSum
# ^^^^^^^^^

def reduce_sum_example():
a = tensor_input('a', dtype='float32', shape=[4, 3])
b = compute(
@@ -345,6 +350,7 @@ def reduce_sum_example():

reduce_sum_example()
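The ReduceSum definition above (a float32 input of shape [4, 3]) presumably sums over the last axis, which is the usual form of this example. A hypothetical pure-NumPy sketch of that semantics, independent of Hidet:

```python
import numpy as np

# Hypothetical NumPy equivalent of reduce_sum_example above, assuming the
# elided compute body defines b[i] = sum_j a[i, j] over a float32 [4, 3] input.
a = np.random.rand(4, 3).astype('float32')

# Explicit loop form, mirroring compute(...) + reduce(..., reduce_type='sum')
b = np.empty([4], dtype='float32')
for i in range(4):
    acc = np.float32(0.0)
    for j in range(3):
        acc += a[i, j]
    b[i] = acc

# Same result as NumPy's axis reduction
assert np.allclose(b, a.sum(axis=1))
```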


# %%
# ArgMax
# ^^^^^^
@@ -360,9 +366,9 @@ def arg_max_example():
run_task(task, [hidet.randn([4, 3])], [hidet.empty([4], dtype='int32')])



arg_max_example()
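Similarly, the ArgMax task maps a float32 [4, 3] input to an int32 [4] output, which suggests an `arg_reduce` over the last axis. A hypothetical NumPy sketch of that computation (illustrative only):

```python
import numpy as np

# Hypothetical NumPy equivalent of arg_max_example above, assuming
# b[i] = argmax_j a[i, j] with an int32 output of shape [4].
a = np.random.rand(4, 3).astype('float32')

# Explicit loop form: keep the index of the first maximum along axis 1
b = np.empty([4], dtype='int32')
for i in range(4):
    best = 0
    for j in range(1, 3):
        if a[i, j] > a[i, best]:
            best = j
    b[i] = best

# Same result as NumPy's argmax (both keep the first maximum on ties)
assert (b == np.argmax(a, axis=1)).all()
```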


# %%
# MatMul
# ^^^^^^
@@ -406,7 +412,7 @@ def softmax_example():
# %%
# Summary
# -------
# In this article, we introduced the compute primitives that are used to define the computation of operators in Hidet.
# In this tutorial, we introduced the compute primitives that are used to define the computation of operators in Hidet.
# After that, we showed how to wrap the computation DAG into a task and build and run the task. In the next step, we
# will show you how to use these compute primitives to define new operators in Hidet.
#
#
126 changes: 126 additions & 0 deletions gallery/how-to-guides/add-new-operator-rule-based.py
@@ -2,7 +2,133 @@
Using Rule-based Scheduling
===========================

In the previous tutorial, we learned how to define a computation using compute primitives and wrap it into a
:py:class:`~hidet.ir.task.Task`. In this tutorial, we will learn how to add an operator (i.e., a
:py:class:`~hidet.graph.Operator`) with a given computation definition, and use Hidet's rule-based scheduler to
automatically schedule the computation into a tensor program.

Three steps to define a new operator
------------------------------------

There are three steps to define a new operator in Hidet.

1. Define the computation task class by inheriting :py:class:`~hidet.ir.task.Task`.
2. Define the operator class by inheriting :py:class:`~hidet.graph.Operator`.
3. Define a function to create the operator instance.

Batch Matrix Multiplication Example
-----------------------------------

We will take the batch matrix multiplication as an example to illustrate the three steps.

1. Define the computation task class
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We define the computation task class **BatchMatmulTask** by inheriting the :py:class:`~hidet.ir.task.Task` class. The
**BatchMatmulTask** constructor takes two arguments, **a** and **b**, which are the input tensor nodes
of the batch matrix multiplication.
"""

# sphinx_gallery_start_ignore
# Hidet uses numpy for tensor printing; this line reduces the number of printed digits
import numpy as np
np.set_printoptions(precision=2, suppress=True)
# sphinx_gallery_end_ignore
from hidet.ir.compute import TensorNode, compute, reduce
from hidet.ir.task import Task


class BatchMatmulTask(Task):
def __init__(self, a: TensorNode, b: TensorNode):
# get the input sizes
batch_size, m_size, k_size = a.const_shape()
batch_size, k_size, n_size = b.const_shape()

# define the computation
c = compute(
name='c',
shape=[batch_size, m_size, n_size],
fcompute=lambda p, i, j: reduce(
shape=[k_size],
fcompute=lambda k: a[p, i, k] * b[p, k, j],
reduce_type='sum'
)
)

# call the parent class constructor to initialize the task
super().__init__(
name='batch_matmul', # the name of the task
inputs=[a, b], # the input tensor nodes
outputs=[c] # the output tensor nodes
)
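The compute definition in **BatchMatmulTask** encodes c[p, i, j] = sum over k of a[p, i, k] * b[p, k, j]. As a sanity check on these semantics, here is a pure-NumPy sketch (illustrative only, not part of Hidet) that spells out the same loop nest and compares it against NumPy's batched matmul:

```python
import numpy as np

# Reference semantics of the BatchMatmulTask compute definition above:
# c[p, i, j] = sum_k a[p, i, k] * b[p, k, j]
batch_size, m_size, k_size, n_size = 2, 2, 3, 2
a = np.random.rand(batch_size, m_size, k_size).astype('float32')
b = np.random.rand(batch_size, k_size, n_size).astype('float32')

c = np.zeros([batch_size, m_size, n_size], dtype='float32')
for p in range(batch_size):
    for i in range(m_size):
        for j in range(n_size):
            for k in range(k_size):
                c[p, i, j] += a[p, i, k] * b[p, k, j]

# Same result as NumPy's batched matrix multiplication
assert np.allclose(c, np.matmul(a, b), atol=1e-5)
```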


# %%
# 2. Define the operator class
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Our next step is to define the operator class **BatchMatmulOp** by inheriting the :py:class:`~hidet.graph.Operator` class.
from hidet.graph import Operator, Tensor
from hidet.graph.ops.definitions.utils import input_like


class BatchMatmulOp(Operator):
def __init__(self, a: Tensor, b: Tensor):
# call the parent class constructor to initialize the operator
super().__init__(
inputs=[a, b], # the input tensors
task=BatchMatmulTask( # the task of the operator
# create tensor nodes (TensorNode) with the same shape and dtype as the tensors (Tensor)
input_like(a, 'a'),
input_like(b, 'b')
)
)


# %%
# 3. Define a function to create the operator instance
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# We define a function **batch_matmul** that creates a **BatchMatmulOp** instance and returns its output tensor.


def batch_matmul(a: Tensor, b: Tensor) -> Tensor:
# get_output(0) returns the first output tensor of the operator
return BatchMatmulOp(a, b).get_output(0)


# %%
# Use the defined operator
# ~~~~~~~~~~~~~~~~~~~~~~~~
# The new operator is no different from the operators provided by Hidet, since Hidet's own operators are defined in
# the same way. For example, when we optimize the flow graph, this new operator can also be fused with surrounding
# operators.
import hidet


def demo_usage():
a = hidet.randn([2, 2, 3])
b = hidet.randn([2, 3, 2])
c = batch_matmul(a, b)
print(a)
print(b)
print(c)

demo_usage()

# %%
# Two Scheduling Mechanisms
# -------------------------
# We only define the computation of the operator and leave the scheduling to the rule-based scheduler provided by
# Hidet. We call this scheduling method **rule-based scheduling**. Most Hidet operators use the same rule-based
# scheduler as in this example. Our experience shows that the rule-based scheduler can achieve good performance for
# operators that do not involve large amounts of reduction. However, for operators such as matrix multiplication and
# convolution, the rule-based scheduler may not achieve the best performance, as it does not use shared memory to
# cache loaded data. Thus, Hidet also provides another scheduling mechanism: **template-based scheduling**.
#

# %%
# Summary
# -------
# In this tutorial, we have learned how to define a new operator with a given computation definition, and how to use
# Hidet's rule-based scheduler to automatically schedule the computation into a tensor program. In the next tutorial,
# we will learn how to use template-based scheduling to achieve better performance.