Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

# C/C++ follows clang-format
[*.{c,cpp,h,hpp}]
indent_style = space
indent_size = 4
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ if (NOT CMAKE_BUILD_TYPE)
endif()
if(UNIX)
else() # Windows
# Force CMake to use icx-cl rather than the default C++ compiler/linker
# Force CMake to use icx-cl rather than the default C++ compiler/linker
# (needed on Windows only)
# include (CMakeForceCompiler)
# CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP)
set(CMAKE_CXX_COMPILER icx-cl)
include (Platform/Windows-Clang)
include(cmake/GTestExternal.cmake)
endif()
endif()

project(XeTLA)

Expand Down
79 changes: 43 additions & 36 deletions examples/01_gemm_universal/gemm_universal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <tests/utils/utils.hpp>
#include "xetla.hpp"
#include <tests/utils/utils.hpp>

enum class kslicing_impl_t : uint8_t { none = 0, global = 1, local = 2 };

template <kslicing_impl_t kslicing_type = kslicing_impl_t::none>
template <gpu_arch arch_tag,
kslicing_impl_t kslicing_type = kslicing_impl_t::none>
void gemm_universal_run(uint32_t iter) {
// Tips, the example demonstrates programming kernel with XeTLA, it works as expected with current configurations.
// Please make sure you fully understand these configurations before you do any modifications, incomplete changes may lead to unexpected behaviors.
Expand Down Expand Up @@ -82,7 +83,7 @@ void gemm_universal_run(uint32_t iter) {
constexpr uint32_t num_local_splitk
= (kslicing_type == kslicing_impl_t::local) ? 2 : 1;

// Mirco-kernel configuration
// Micro-kernel configuration
using tune_option = dict_t<
elem_v_t<tune_key::param_optimizer_type,
tune_key_value::param_optimizer_decision_tree>,
Expand All @@ -102,8 +103,8 @@ void gemm_universal_run(uint32_t iter) {
data_type_c, // output datatype for C
mem_layout::row_major, // memory layout for C
8, // leading dimension alignment for C, in unit of element
data_type_acc, // accumulator data type for intermediate resutls
gpu_arch::Xe, // GPU arch
data_type_acc, // accumulator data type for intermediate results
arch_tag, // GPU arch
tune_option>;

// allocate temp buffers for global split
Expand Down Expand Up @@ -184,36 +185,42 @@ void gemm_universal_run(uint32_t iter) {
free(Cnt, context);
}

// Architecture-dispatch wrapper: dispatch_arch instantiates this struct for
// the GPU architecture detected at runtime and invokes its exec functor.
template <gpu_arch arch_tag>
struct main_wrapper {
static constexpr auto exec = []() {
// An example code for calculating matrix multiplication using
// GEMM_UNIVERSAL API:
// C = A x B
// The resulting matrix C is partitioned by the group range
// into multiple blocks. The block matrix
// C<i_w, j_w>
// is computed by the workgroup with id: (0, i_w, j_w).
// (i_w, j_w) is an element in range specified by group range.
// Each thread with index (0, i_s, j_s) inside the same workgroup
// is responsible for a sub block of matrix multiplication, which is
// C<i_w, j_w>[i_s*sg_m:(i_s+1)*sg_m, j_s*sg_n:(j_s+1)*sg_n]

// Alternatively, some threads can cooperate on the same sub block
// matrix given the same (i_s, j_s), i.e. the index space is extended
// from (0, i_s, j_s) to (k_s, i_s, j_s).

// Another method to achieve the same effect is to extend the index space
// in group range, i.e. from (0, i_w, j_w) to (k_w, i_w, j_w)

// A more detailed description of the cooperation (kslicing) can
// be found in the example 01_gemm_universal with custom implementation

// basic gemm_universal
gemm_universal_run<arch_tag, kslicing_impl_t::none>(10);

// basic gemm_universal with workgroup cooperation
// gemm_universal_run<arch_tag, kslicing_impl_t::global>(10);

// basic gemm_universal with thread cooperation
// gemm_universal_run<arch_tag, kslicing_impl_t::local>(10);
};
};
int main() {
// An example code for calculating matrix multiplication using
// GEMM_UNIVERSAL API:
// C = A x B
// The resulted matrix C is partitioned by the group range
// in to multiple blocks. The block matrix
// C<i_w, j_w>
// is computed by the workgroup with id: (0, i_w, j_w).
// (i_w, j_w) is an element in range specified by group range.
// Each thread with index (0, i_s, j_s) inside the same workgroup
// is responsible for a sub block of matrix multiplication, which is
// C<i_w, j_w>[i_s*sg_m:(i_s+1):sg_m,j_s*sg_n:(j_s+1)*sg_n]

// Alternatively, some threads can cooperate on the same sub block
// matrix given the same (i_s, j_s), i.e. the index space is extended
// from (0, i_s, j_s) to (k_s, i_s, j_s).

// Another method to achieve the same effect is to extend the index space
// in group range, i.e. from (0, i_w, j_w) to (k_w, i_w, j_w)

// More detailed description referring to the cooperation (kslicing) could
// be found in the example 01_gemm_universal with custom implementation

// basic gemm_universal
gemm_universal_run<kslicing_impl_t::none>(10);

// basic gemm_universal with workgroup cooperation
// gemm_universal_run<kslicing_impl_t::global>(10);

// basic gemm_universal with thread cooperation
// gemm_universal_run<kslicing_impl_t::local>(10);
return (0);
dispatch_arch<main_wrapper>::exec();
return 0;
}
61 changes: 30 additions & 31 deletions examples/02_basic_gemm/basic_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <tests/utils/utils.hpp>
#include "xetla.hpp"
#include <tests/utils/utils.hpp>

template <gpu_arch arch_tag_>
template <gpu_arch arch_tag>
void basic_gemm_run(sycl::queue queue, uint32_t iter) {
// Tips, the example demonstrates programming kernel with XeTLA, it works as expected with current configurations.
// Please make sure you fully understand these configurations before you do any modifications, incomplete changes may lead to unexpected behaviors.
Expand Down Expand Up @@ -110,11 +110,11 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
// should be larger than 8
static constexpr uint32_t k_stride = 32;

// Step 1: define mirco-kernel's configuration
// Step 1: define Micro-kernel's configuration
using wg_shape = shape<wg_tile_n, wg_tile_m>;
using sg_shape = shape<sg_tile_n, sg_tile_m>;

// Mirco-kernel configuration
// Micro-kernel configuration
using gemm_tune_option
= dict_t<elem_t_t<tune_key::sg_tile_shape, sg_shape>,
elem_v_t<tune_key::prefetch_distance,
Expand All @@ -132,10 +132,10 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
8, // leading dimension for B, in unit of element
mem_space::
global, // memory reading from global mem for B
data_type_acc, // accumulator data type for intermediate resutls
data_type_acc, // accumulator data type for intermediate results
wg_shape, // computation tile shape
k_stride, // elements in each iteration
gpu_arch::Xe, // GPU arch
arch_tag, // GPU arch
gemm_tune_option>;
gemm_t gemm;

Expand All @@ -149,24 +149,26 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
mem_space::global, // memory writing to global mem for C
wg_shape, // computation tile shape
k_stride, // elements in each iteration
gpu_arch::Xe, // GPU arch
arch_tag, // GPU arch
epilogue_tune_option>;

// Step 3: define the shared local memory usages
// developers have the responsibility to set
// shared loacal memory through XeTLA API
// shared local memory through XeTLA API
static constexpr uint32_t barrier_count = gemm_t::barrier_count;
static constexpr uint32_t slm_size = gemm_t::slm_size;
static_assert(slm_size <= arch_attr_t<arch_tag>::local_mem_size,
"The local memory size excess!");
xetla_nbarrier_init<barrier_count>();
xetla_local_init<slm_size>();

// Step 4: ecah workgroup gets it individual index to start computation
// Step 4: each workgroup gets its individual index to start computation
int start_n = item.get_group(2) * wg_tile_n;
int start_m = item.get_group(1) * wg_tile_m;
// no slicing in K direction so start from zero for all WG
int start_k = 0;

// Each workgroup will compute all data in K based on no k_sliciing
// Each workgroup will compute all data in K based on no k_slicing
// The developer can set how much data a subgroup compute by k_stride
uint32_t wg_tile_k = matrix_k;
uint32_t inner_loop_count
Expand All @@ -183,7 +185,7 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
mem_desc_output_c md_c(
{C}, {matrix_n, matrix_m, ldc}, {start_n, start_m});

// Step 6: real calculation with accumulator variables which are
// expected to reside in registers.
// will be in register.
typename gemm_t::matAcc_t matAcc;
matAcc.init(0);
Expand All @@ -194,8 +196,7 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
// the results is in the matAcc rather than real output C
typename gemm_t::work_group_t g(item.get_local_linear_id());
gemm(g, matAcc, gemm_args);

// Step 7: write the results from matACC to real output C
// Step 7: write the results from matAcc to real output C
epilogue_t epilogue;
epilogue(g, matAcc, md_c);
});
Expand All @@ -220,23 +221,21 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
free(C, context);
}

// Architecture-dispatch wrapper: dispatch_arch instantiates this struct for
// the GPU architecture detected at runtime and invokes its exec functor.
template <gpu_arch arch_tag>
struct main_wrapper {
static constexpr auto exec = []() {
// This case shows how to use batch-reduce (br) GEMM microkernel to
// solve a standard GEMM
// Turn on the profiling property to facilitate subsequent profiling
sycl::property_list properties {
sycl::property::queue::enable_profiling()};

// Define SYCL queue, context and device
auto queue = sycl::queue(properties);
basic_gemm_run<arch_tag>(queue, 10);
};
};
int main() {
// This case shows how to use batch-reduce (br) GEMM microkernel to
// solve a standard GEMM
// Turn on the profiling property to facilitate subsequent profiling
sycl::property_list properties {sycl::property::queue::enable_profiling()};

// Define SYCL queue, context and device
auto queue = sycl::queue(properties);
auto device = queue.get_device();

// Detect the execution size, 8 for Arc, 16 for PVC.
int ExecSize
= device.get_info<ext::intel::info::device::gpu_eu_simd_width>();
if (ExecSize == 8) {
basic_gemm_run<gpu_arch::Dg2>(queue, 10);
} else {
basic_gemm_run<gpu_arch::Xe>(queue, 10);
}
return (0);
dispatch_arch<main_wrapper>::exec();
return 0;
}
10 changes: 5 additions & 5 deletions examples/03_gemm_relu_bias/gemm_relu_bias.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <tests/utils/utils.hpp>
#include "xetla.hpp"
#include <tests/utils/utils.hpp>

using namespace cl::sycl;
using namespace gpu::xetla;
Expand Down Expand Up @@ -140,7 +140,7 @@ void gemm_relu_bias_run(uint32_t iter) {
using epilogue_policy
= xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>;

// Mirco-kernel configuration
// Micro-kernel configuration
using tune_option = dict_t<
elem_v_t<tune_key::param_optimizer_type,
tune_key_value::param_optimizer_decision_tree>,
Expand All @@ -156,7 +156,7 @@ void gemm_relu_bias_run(uint32_t iter) {
data_type_c, // output datatype for C
mem_layout::row_major, // memory layout for C
8, // leading dimension alignment for C, in unit of element
data_type_acc, // accumulator data type for intermediate resutls
data_type_acc, // accumulator data type for intermediate results
gpu_arch::Xe, // GPU arch
tune_option>;
using gemm_op_t = typename default_config_t::type;
Expand Down Expand Up @@ -223,15 +223,15 @@ int main() {
// The purpose of this example is to illustrate the epilogue_t API in XeTLA.

// It allows user to implement multiple Ops inside a kernel call to avoid
// overheads in invokation, memory transfer, etc.
// overheads in invocation, memory transfer, etc.
// Take the following python code as an example:

// Original:
// > import torch as to
// > x = to.matmul(A, B)
// > y = to.nn.functional.relu(x)

// It takes two kernel invocations and the ReLU Op is an elementwise operation
// It takes two kernel invocations and the ReLU Op is a elementwise operation
// that could be fused into MatMul Op, which is basically calling GEMM kernel.

// Fusion:
Expand Down
6 changes: 3 additions & 3 deletions examples/04_gemm_polynomial/gemm_polynomial.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <tests/utils/utils.hpp>
#include "xetla.hpp"
#include <tests/utils/utils.hpp>

#include "gemm_polynomial.hpp"

Expand Down Expand Up @@ -137,7 +137,7 @@ void gemm_polynomial_run(int iter) {
using epilogue_policy
= xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>;

// Mirco-kernel configuration
// Micro-kernel configuration
using tune_option = dict_t<
elem_v_t<tune_key::param_optimizer_type,
tune_key_value::param_optimizer_decision_tree>,
Expand All @@ -154,7 +154,7 @@ void gemm_polynomial_run(int iter) {
data_type_c, // output datatype for C
mem_layout::row_major, // memory layout for C
8, // leading dimension alignment for C, in unit of element
data_type_acc, // accumulator data type for intermediate resutls
data_type_acc, // accumulator data type for intermediate results
gpu_arch::Xe, // GPU arch
tune_option>;

Expand Down
4 changes: 2 additions & 2 deletions examples/05_batch_gemm/batch_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ void batch_gemm_run(uint32_t iter) {
using wg_shape = shape<wg_tile_n, wg_tile_m>;
using sg_shape = shape<sg_tile_n, sg_tile_m>;

// Mirco-kernel configuration
// Micro-kernel configuration
using tune_option
= dict_t<elem_v_t<tune_key::param_optimizer_type,
tune_key_value::param_optimizer_decision_tree>,
Expand All @@ -106,7 +106,7 @@ void batch_gemm_run(uint32_t iter) {
mem_layout::row_major, // memory layout for B
8, // leading dimension for B, in unit of element
mem_space::global, // memory reading from global mem for B
data_type_acc, // accumulator data type for intermediate resutls
data_type_acc, // accumulator data type for intermediate results
wg_shape, // computation tile shape
wg_tile_k, // elements in each iteration
gpu_arch::Xe, // GPU arch
Expand Down
4 changes: 2 additions & 2 deletions examples/05_batch_gemm/batch_gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ class batch_gemm_t {
/// @return The size of local memory required.
__XETLA_API static constexpr uint32_t get_slm_size() {
constexpr uint32_t size = gemm_t::slm_size + epilogue_t::slm_size;
static_assert(size <= (128 * 1024),
"The local memory size should be less than 128KB!");
static_assert(size <= arch_attr_t<arch_tag>::local_mem_size,
"The local memory size excess!");
return size;
};

Expand Down
Loading