hidet-org · yaoyaoding · Nov 5, 2022 · Nov 4, 2022 · Nov 4, 2022 · Nov 4, 2022
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -0,0 +1,69 @@
+name: Docs
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'docs/**'
+      - 'gallery/**'
+
+jobs:
+  docs:
+    runs-on: [self-hosted, Linux, X64, gpu]
+    container:
+      image: nvidia/cuda:11.8.0-devel-ubuntu20.04
+      options: --gpus all
+    steps:
+      - name: Install dependencies via apt
+        run: |
+          apt update && apt install -y ccache git
+
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+
+      - name: Setup cmake
+        uses: jwlawson/actions-setup-cmake@v1.13
+        with:
+          cmake-version: '3.19.x'
+
+      - name: Install dependencies via pip
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -r requirements-dev.txt
+
+      - name: Build hidet
+        run: |
+          bash scripts/build_wheel.sh
+          WHEEL=$(find ./scripts/ -maxdepth 1 -name '*.whl')  # path of the wheel produced by the build script
+          echo "WHEEL_NAME=$WHEEL" >> $GITHUB_ENV  # exported for later steps; not visible via env.* inside this step
+          echo "Built wheel: $WHEEL"
+
+      - name: Install hidet
+        run: |
+          pip install --no-deps --force-reinstall ${{ env.WHEEL_NAME }}
+
+
+      # Build the docs
+
+      - name: Install docs dependencies
+        run: |
+          pip install -r docs/requirements.txt
+
+      - name: Build docs
+        run: |
+          cd docs; make clean; make html
+
+      - name: Upload docs
+        uses: up9cloud/action-rsync@master  # NOTE(review): mutable ref on a step that receives deploy secrets; consider pinning to a tag or commit SHA
+        env:
+          VERBOSE: true
+          SOURCE: docs/build/html
+          HOST: ${{ secrets.DOCS_HOST }}
+          KEY: ${{ secrets.DOCS_DEPLOY_SSH_KEY }}
+          USER: ${{ secrets.DOCS_DEPLOY_USER }}
+          TARGET: ${{ secrets.DOCS_TARGET }}
diff --git a/.github/workflows/fast_tests.yaml b/.github/workflows/fast_tests.yaml
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
@@ -6,7 +6,7 @@ on:
   pull_request:
 
 jobs:
-  check-format-and-lint:
+  format-and-lint:
     runs-on: ubuntu-latest
     strategy:
       matrix:

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -1,22 +1,10 @@
-name: Python Tests (Full)
+name: Tests
 
 on:
   push:
     branches: [main]
-    paths: # run when any of these files change
-      - 'include/**'
-      - 'src/**'
-      - 'python/hidet/backend/**'
-      - 'python/hidet/transforms/**'
-      - 'python/hidet/ir/**'
+
   pull_request:
-    branches: [main]
-    paths: # run when any of these files change
-      - 'include/**'
-      - 'src/**'
-      - 'python/hidet/backend/**'
-      - 'python/hidet/transforms/**'
-      - 'python/hidet/ir/**'
 
 jobs:
   tests:
@@ -25,31 +13,73 @@ jobs:
       image: nvidia/cuda:11.8.0-devel-ubuntu20.04
       options: --gpus all
     steps:
+      - name: Install dependencies via apt
+        run: |
+          pwd
+          apt update && apt install -y ccache git
+          git --version
+
       - uses: actions/checkout@v3
+
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version: "3.8"
+
       - name: Setup cmake
         uses: jwlawson/actions-setup-cmake@v1.13
         with:
           cmake-version: '3.19.x'
-      - name: Setup ccache
-        run: |
-          apt update && apt install -y ccache
-      - name: Build hidet
+
+      - name: Install dependencies via pip
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
           pip install -r requirements-dev.txt
+
+      - name: Build hidet
+        run: |
           bash scripts/build_wheel.sh
           WHEEL=$(find ./scripts/ -maxdepth 1 -name '*.whl')
-          echo "Built wheel: $WHEEL" 
-          pip install --no-deps --force-reinstall $WHEEL
-      - name: Run minimal tests
+          echo "WHEEL_NAME=$WHEEL" >> $GITHUB_ENV  # exported for later steps; not visible via env.* inside this step
+          echo "Built wheel: $WHEEL"
+
+      - name: Install hidet
         run: |
-          python -m pytest -v tests/minimal/test_add.py
-      - name: Run full tests
+          pip install --no-deps --force-reinstall ${{ env.WHEEL_NAME }}
+
+      # Run tests
+
+      - name: Fix ownership of the repo
+        run: |
+          git config --global --add safe.directory "$(pwd)"  # register the current directory as a git safe.directory
+
+      - name: Diff against main
+        uses: technote-space/get-diff-action@v6
+        with:
+          PATTERNS: |
+            include/**/*.h
+            src/**/*
+            python/hidet/backend/**/*.py
+            python/hidet/transforms/**/*.py
+            python/hidet/ir/**/*.py
+
+      - name: Run tests with operator cache cleared
+        run: |
+          python -m pytest -v --durations=20 --clear-cache ./tests
+        if: env.GIT_DIFF
+
+      - name: Run tests with operator cache
+        run: |
+          python -m pytest -v --durations=20 ./tests
+        if: "!env.GIT_DIFF"
+
+      # Build the docs
+
+      - name: Install docs dependencies
+        run: |
+          pip install -r docs/requirements.txt
+
+      - name: Build docs
         run: |
-          # stop the build if format is not correct, clear cache
-          python -m pytest -v --clear-cache ./tests   
+          cd docs; make clean; make html
diff --git a/.gitignore b/.gitignore
@@ -201,3 +201,6 @@ build-release
 
 # onnx model
 *.onnx
+
+# intermediate files
+/gallery/**/*.json
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -20,7 +20,17 @@
 import sphinx.errors
 sphinx.application.ExtensionError = sphinx.errors.ExtensionError
 
-sys.path.insert(0, os.path.abspath('../../python'))
+try:
+    # The gallery scripts need to import the hidet module.
+    # We first try to import hidet from the existing sys.path.
+    import hidet
+    # If that succeeds, hidet is already installed and the fallback below is skipped.
+except ImportError:
+    # Otherwise, we might be in a git repo, so we make hidet importable by adding the repo's python directory to sys.path.
+    sys.path.insert(0, os.path.abspath('../../python'))
+
+import hidet  # must resolve by now, either from an installed package or from the sys.path fallback
+print('Build docs with cache dir: {}'.format(hidet.option.get_cache_dir()))
 
 # -- Project information -----------------------------------------------------
 

diff --git a/gallery/getting-started/quick-start.py b/gallery/getting-started/quick-start.py
@@ -10,7 +10,7 @@
 # %%
 # We should first import hidet.
 import hidet
-hidet.option.cache_dir('./cache')
+
 # %%
 # Define tensors
 # --------------

diff --git a/gallery/how-to-guides/add-new-operator-template-based.py b/gallery/how-to-guides/add-new-operator-template-based.py
@@ -45,7 +45,10 @@ def __init__(self, a: TensorNode, b: TensorNode):
             }
         )
 
-    def implement_cuda(self) -> IRModule:
+    def allow_epilogue(self) -> bool:
+        return False
+
+    def implement_cuda(self, working_dir: str) -> IRModule:
         # override this method to use template-based scheduling
         return batch_matmul_mma_fp16_schedule(self)
 
@@ -75,20 +78,20 @@ def batch_matmul_mma_fp16_schedule(task: BatchMatmulFp16Task) -> IRModule:
     from hidet.lang.mapping import repeat, spatial
     from hidet.lang.cuda import blockIdx, threadIdx, syncthreads
     from hidet.lang.cuda import MmaConfig, mma_sync
-    from hidet.transforms.tools import fuse_and_pack
+    from hidet.transforms.tools import generate_packed_func
 
     # get the workload size
-    bs = task.attributes['batch_size'],
-    m_size = task.attributes['m_size'],
-    n_size = task.attributes['n_size'],
+    bs = task.attributes['batch_size']
+    m_size = task.attributes['m_size']
+    n_size = task.attributes['n_size']
     k_size = task.attributes['k_size']
 
     # define the template hyper-parameters
-    mma_config = MmaConfig.m16n8k16_f16_f16()
-    block_m, block_n, block_k = 128, 128, 16
-    warp_m, warp_n, warp_k = 64, 64, 16
+    mma_config = MmaConfig.m16n8k8_f16_f16()
+    block_m, block_n, block_k = 128, 128, 8
+    warp_m, warp_n, warp_k = 64, 64, 8
     warp_count_m, warp_count_n, warp_count_k = 2, 2, 1
-    mma_m, mma_n, mma_k = mma_config.m, mma_config.n, mma_config.k  # 16, 8, 16
+    mma_m, mma_n, mma_k = mma_config.m, mma_config.n, mma_config.k  # 16, 8, 8
     mma_count_m, mma_count_n, mma_count = 4, 8, 1
     threads = warp_count_m * warp_count_n * warp_count_k * 32
 
@@ -177,9 +180,9 @@ def batch_matmul_kernel(
                 offset_k = k0 * block_k
                 gmem_a = a[blockIdx.z, offset_m:, offset_k:]
                 gmem_b = b[blockIdx.z, offset_k:, offset_n:]
-                for i, k in repeat(16, 1).spatial(8, 16).on(threadIdx.x):
+                for i, k in repeat(8, 1).spatial(16, 8).on(threadIdx.x):
                     smem_a[i, k] = gmem_a.read([i, k], protected=True)
-                for k, j in repeat(16, 1).spatial(1, 128).on(threadIdx.x):
+                for k, j in repeat(8, 1).spatial(1, 128).on(threadIdx.x):
                     smem_b[k, j] = gmem_b.read([k, j], protected=True)
                 syncthreads()
                 load_regs_a(smem_a, regs_a)
@@ -190,7 +193,8 @@ def batch_matmul_kernel(
 
     ir_module = module.ir_module()
     # conduct the fusion (when the task has prologue or epilogue) and generate the packed function
-    ir_module = fuse_and_pack(ir_module, kernel_func=batch_matmul_kernel, task=task)
+    # NOTE(review): fuse_and_pack is no longer used here; only the packed function is generated (presumably added to ir_module in place, since the return value is unused - confirm)
+    generate_packed_func(ir_module, func=batch_matmul_kernel, pack_func_name=task.name)
     return ir_module
 
 

diff --git a/python/hidet/ir/task.py b/python/hidet/ir/task.py
@@ -156,10 +156,10 @@ def implement_cuda(self, workding_dir: str) -> IRModule:
     def implement_cpu(self, workding_dir: str) -> IRModule:
         return NotImplemented
 
-    def allow_prologue(self) -> True:
+    def allow_prologue(self) -> bool:
         return True
 
-    def allow_epilogue(self) -> True:
+    def allow_epilogue(self) -> bool:
         return True
 
     def is_injective_task(self) -> bool:

diff --git a/python/hidet/transforms/tools/apply_prologue_epilogue.py b/python/hidet/transforms/tools/apply_prologue_epilogue.py
@@ -63,6 +63,7 @@ def visit_Function(self, func: Function):
         self.anchor_outputs: List[Var] = func.params[anchor_num_inputs:]
 
         # create parameters for fused function, and bind task graph parameters to function parameters
+        # todo: do not create new parameters for the inputs/outputs that have not been fused
         new_params: List[Var] = []
         for tensor_node in self.task_graph.input_tensors + self.task_graph.output_tensors:
             new_params.append(Var(tensor_node.name, tensor_node.data_type))